In [1]:
import pandas as pd
from datetime import datetime, timedelta

In [2]:
config = {
    "comedy": {
        "path": "source_data/clara/pupil_size_extracted_comedy.csv",
        "start_time": datetime.strptime("2025-06-07 10:29:54", "%Y-%m-%d %H:%M:%S")
    },
    "documentary": {
        "path": "source_data/clara/pupil_size_extracted_documentary.csv",
        "start_time": datetime.strptime("2025-06-07 11:30:09", "%Y-%m-%d %H:%M:%S")
    },
    "horror": {
        "path": "source_data/clara/pupil_size_extracted_horror.csv",
        "start_time": datetime.strptime("2025-06-07 12:55:58", "%Y-%m-%d %H:%M:%S")
    },
}

In [3]:
comedy = pd.read_csv(config["comedy"]["path"])
documentary = pd.read_csv(config["documentary"]["path"])
horror = pd.read_csv(config["horror"]["path"])

comedy

Unnamed: 0,success,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,gaze_direction,overall_confidence,pupil_classification,pupil_detected,iris_detected,pupil_center_x,pupil_center_y,iris_center_x,iris_center_y,gaze_magnitude,concentricity_score,pupil_circularity,timestamp
0,True,0.000000,11.8,0.000000,Center,0.370172,Unknown,True,False,217,9,0,0,0.000000,0.0,0.493562,1255.0
1,True,0.000000,11.8,0.000000,Center,0.255555,Unknown,True,False,242,65,0,0,0.000000,0.0,0.340740,873.5
2,True,0.000000,11.8,0.000000,Center,0.267208,Unknown,True,False,4,75,0,0,0.000000,0.0,0.356278,599.0
3,True,0.000000,11.8,0.000000,Center,0.264557,Unknown,True,False,8,74,0,0,0.000000,0.0,0.352743,973.5
4,True,1.493671,11.8,0.126582,Down,0.486246,Miosis,True,True,104,202,118,148,0.706143,0.0,0.329369,499.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2675,True,0.000000,11.8,0.000000,Center,0.342467,Unknown,True,False,91,109,0,0,0.000000,0.0,0.456623,1209.0
2676,True,0.000000,11.8,0.000000,Center,0.252881,Unknown,True,False,4,77,0,0,0.000000,0.0,0.337174,672.0
2677,True,0.000000,11.8,0.000000,Center,0.260699,Unknown,True,False,86,106,0,0,0.000000,0.0,0.347598,123.0
2678,True,0.000000,11.8,0.000000,Center,0.310214,Unknown,True,False,86,65,0,0,0.000000,0.0,0.413619,1309.0


In [4]:
# drop useless rows 
def clean_rows(df_to_clean):
    return df_to_clean[df_to_clean['success'] == True]

comedy = clean_rows(comedy)
documentary = clean_rows(documentary)
horror = clean_rows(horror)

print(set(comedy.success))
print(set(documentary.success))
print(set(horror.success))

{True}
{True}
{True}


In [5]:
columns_to_drop = ['success', 'pupil_classification', 'pupil_center_x', 'pupil_center_y', 'iris_center_x', 'iris_center_y', 'gaze_magnitude', 'concentricity_score']

comedy.drop(columns=columns_to_drop, inplace=True)
documentary.drop(columns=columns_to_drop, inplace=True)
horror.drop(columns=columns_to_drop, inplace=True)

In [6]:
comedy

Unnamed: 0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,gaze_direction,overall_confidence,pupil_detected,iris_detected,pupil_circularity,timestamp
0,0.000000,11.8,0.000000,Center,0.370172,True,False,0.493562,1255.0
1,0.000000,11.8,0.000000,Center,0.255555,True,False,0.340740,873.5
2,0.000000,11.8,0.000000,Center,0.267208,True,False,0.356278,599.0
3,0.000000,11.8,0.000000,Center,0.264557,True,False,0.352743,973.5
4,1.493671,11.8,0.126582,Down,0.486246,True,True,0.329369,499.0
...,...,...,...,...,...,...,...,...,...
2675,0.000000,11.8,0.000000,Center,0.342467,True,False,0.456623,1209.0
2676,0.000000,11.8,0.000000,Center,0.252881,True,False,0.337174,672.0
2677,0.000000,11.8,0.000000,Center,0.260699,True,False,0.347598,123.0
2678,0.000000,11.8,0.000000,Center,0.310214,True,False,0.413619,1309.0


In [7]:
# Convert timestamp (float seconds) to actual datetime
comedy['time'] = comedy['timestamp'].apply(lambda x: config["comedy"]["start_time"] + timedelta(seconds=x))
documentary['time'] = documentary['timestamp'].apply(lambda x: config["documentary"]["start_time"] + timedelta(seconds=x))
horror['time'] = horror['timestamp'].apply(lambda x: config["horror"]["start_time"] + timedelta(seconds=x))

comedy

Unnamed: 0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,gaze_direction,overall_confidence,pupil_detected,iris_detected,pupil_circularity,timestamp,time
0,0.000000,11.8,0.000000,Center,0.370172,True,False,0.493562,1255.0,2025-06-07 10:50:49.000
1,0.000000,11.8,0.000000,Center,0.255555,True,False,0.340740,873.5,2025-06-07 10:44:27.500
2,0.000000,11.8,0.000000,Center,0.267208,True,False,0.356278,599.0,2025-06-07 10:39:53.000
3,0.000000,11.8,0.000000,Center,0.264557,True,False,0.352743,973.5,2025-06-07 10:46:07.500
4,1.493671,11.8,0.126582,Down,0.486246,True,True,0.329369,499.0,2025-06-07 10:38:13.000
...,...,...,...,...,...,...,...,...,...,...
2675,0.000000,11.8,0.000000,Center,0.342467,True,False,0.456623,1209.0,2025-06-07 10:50:03.000
2676,0.000000,11.8,0.000000,Center,0.252881,True,False,0.337174,672.0,2025-06-07 10:41:06.000
2677,0.000000,11.8,0.000000,Center,0.260699,True,False,0.347598,123.0,2025-06-07 10:31:57.000
2678,0.000000,11.8,0.000000,Center,0.310214,True,False,0.413619,1309.0,2025-06-07 10:51:43.000


In [8]:
def reset_timestamp_column(df_to_reset):
    df_to_reset = df_to_reset.sort_values(by='time').reset_index(drop=True)
    df_to_reset['timestamp'] = df_to_reset['time'] 
    df_to_reset.drop(columns=['time'], inplace=True)
    return df_to_reset

comedy = reset_timestamp_column(comedy)
documentary = reset_timestamp_column(documentary)
horror = reset_timestamp_column(horror)

In [9]:
# comedy = comedy.set_index('timestamp')
# documentary = documentary.set_index('timestamp')
# horror = horror.set_index('timestamp')

In [9]:
comedy

Unnamed: 0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,gaze_direction,overall_confidence,pupil_detected,iris_detected,pupil_circularity,timestamp
0,0.0,11.8,0.0,Center,0.276681,True,False,0.368909,2025-06-07 10:29:54.000
1,0.0,11.8,0.0,Center,0.320633,True,False,0.427511,2025-06-07 10:29:54.500
2,0.0,11.8,0.0,Center,0.235643,True,False,0.314190,2025-06-07 10:29:55.000
3,0.0,11.8,0.0,Center,0.334706,True,False,0.446275,2025-06-07 10:29:55.500
4,0.0,11.8,0.0,Center,0.500000,True,False,0.681159,2025-06-07 10:29:56.000
...,...,...,...,...,...,...,...,...,...
2675,0.0,11.8,0.0,Center,0.244683,True,False,0.326244,2025-06-07 10:52:11.500
2676,0.0,11.8,0.0,Center,0.451639,True,False,0.602186,2025-06-07 10:52:12.000
2677,0.0,11.8,0.0,Center,0.246362,True,False,0.328483,2025-06-07 10:52:12.500
2678,0.0,11.8,0.0,Center,0.282495,True,False,0.376660,2025-06-07 10:52:13.000


In [10]:
numeric_cols = ['pupil_diameter_mm', 'iris_diameter_mm', 'pupil_iris_ratio', 'overall_confidence','pupil_circularity', 'timestamp']
non_numeric_cols = ['gaze_direction', 'pupil_detected', 'iris_detected']

def aggregate_data(df_to_aggregate):
    df_to_aggregate['timestamp'] = pd.to_datetime(df_to_aggregate['timestamp']).dt.tz_localize(None)
    df_to_aggregate['time_rounded'] = df_to_aggregate['timestamp'].dt.floor('1s')
    

    # Function to apply to each group
    def agg_group(group):
        numeric_part = group[numeric_cols].mean()
        best_row = group.loc[group['overall_confidence'].idxmax(), non_numeric_cols]
        return pd.concat([numeric_part, best_row])

    aggregated = df_to_aggregate.groupby('time_rounded').apply(agg_group).reset_index()
    return aggregated

In [11]:
comedy_agg = aggregate_data(comedy)
documentary_agg = aggregate_data(documentary)
horror_agg = aggregate_data(horror)

  aggregated = df_to_aggregate.groupby('time_rounded').apply(agg_group).reset_index()
  aggregated = df_to_aggregate.groupby('time_rounded').apply(agg_group).reset_index()
  aggregated = df_to_aggregate.groupby('time_rounded').apply(agg_group).reset_index()


In [12]:
comedy_agg

Unnamed: 0,time_rounded,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,overall_confidence,pupil_circularity,timestamp,gaze_direction,pupil_detected,iris_detected
0,2025-06-07 10:29:54,0.000000,11.8,0.00000,0.298657,0.398210,2025-06-07 10:29:54.249999872,Center,True,False
1,2025-06-07 10:29:55,0.000000,11.8,0.00000,0.285175,0.380233,2025-06-07 10:29:55.249999872,Center,True,False
2,2025-06-07 10:29:56,0.000000,11.8,0.00000,0.395016,0.533934,2025-06-07 10:29:56.249999872,Center,True,False
3,2025-06-07 10:29:57,0.000000,11.8,0.00000,0.323443,0.431258,2025-06-07 10:29:57.249999872,Center,True,False
4,2025-06-07 10:29:58,0.000000,11.8,0.00000,0.287843,0.383791,2025-06-07 10:29:58.249999872,Center,True,False
...,...,...,...,...,...,...,...,...,...,...
1335,2025-06-07 10:52:09,0.000000,11.8,0.00000,0.299032,0.398709,2025-06-07 10:52:09.249999872,Center,True,False
1336,2025-06-07 10:52:10,0.870492,11.8,0.07377,0.454073,0.412366,2025-06-07 10:52:10.249999872,Up-Left,True,True
1337,2025-06-07 10:52:11,0.000000,11.8,0.00000,0.372341,0.507924,2025-06-07 10:52:11.249999872,Center,True,False
1338,2025-06-07 10:52:12,0.000000,11.8,0.00000,0.349001,0.465334,2025-06-07 10:52:12.249999872,Center,True,False


In [13]:
comedy_agg.drop(columns=['timestamp'], inplace=True)
documentary_agg.drop(columns=['timestamp'], inplace=True)
horror_agg.drop(columns=['timestamp'], inplace=True)

comedy_agg['timestamp'] = comedy_agg['time_rounded']
documentary_agg['timestamp'] = documentary_agg['time_rounded']
horror_agg['timestamp'] = horror_agg['time_rounded']

comedy_agg.drop(columns=['time_rounded'], inplace=True)
documentary_agg.drop(columns=['time_rounded'], inplace=True)
horror_agg.drop(columns=['time_rounded'], inplace=True)

In [14]:
comedy_agg.set_index('timestamp', inplace=True)
documentary_agg.set_index('timestamp', inplace=True)
horror_agg.set_index('timestamp', inplace=True)

In [15]:
comedy_agg

Unnamed: 0_level_0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,overall_confidence,pupil_circularity,gaze_direction,pupil_detected,iris_detected
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-06-07 10:29:54,0.000000,11.8,0.00000,0.298657,0.398210,Center,True,False
2025-06-07 10:29:55,0.000000,11.8,0.00000,0.285175,0.380233,Center,True,False
2025-06-07 10:29:56,0.000000,11.8,0.00000,0.395016,0.533934,Center,True,False
2025-06-07 10:29:57,0.000000,11.8,0.00000,0.323443,0.431258,Center,True,False
2025-06-07 10:29:58,0.000000,11.8,0.00000,0.287843,0.383791,Center,True,False
...,...,...,...,...,...,...,...,...
2025-06-07 10:52:09,0.000000,11.8,0.00000,0.299032,0.398709,Center,True,False
2025-06-07 10:52:10,0.870492,11.8,0.07377,0.454073,0.412366,Up-Left,True,True
2025-06-07 10:52:11,0.000000,11.8,0.00000,0.372341,0.507924,Center,True,False
2025-06-07 10:52:12,0.000000,11.8,0.00000,0.349001,0.465334,Center,True,False


In [16]:
documentary_agg

Unnamed: 0_level_0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,overall_confidence,pupil_circularity,gaze_direction,pupil_detected,iris_detected
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-06-07 11:30:09,0.000000,11.8,0.000000,0.143792,0.191722,Center,True,False
2025-06-07 11:30:10,0.000000,11.8,0.000000,0.269328,0.359104,Center,True,False
2025-06-07 11:30:11,0.000000,11.8,0.000000,0.151042,0.201389,Center,True,False
2025-06-07 11:30:12,0.842857,11.8,0.071429,0.373550,0.337586,Left,True,True
2025-06-07 11:30:13,0.000000,11.8,0.000000,0.321979,0.429306,Center,True,False
...,...,...,...,...,...,...,...,...
2025-06-07 11:57:09,0.000000,11.8,0.000000,0.379036,0.505382,Center,True,False
2025-06-07 11:57:10,0.000000,11.8,0.000000,0.391316,0.572663,Center,True,False
2025-06-07 11:57:11,0.000000,11.8,0.000000,0.405394,0.540525,Center,True,False
2025-06-07 11:57:12,1.154348,11.8,0.097826,0.496705,0.406121,Down-Left,True,True


In [17]:
horror_agg

Unnamed: 0_level_0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,overall_confidence,pupil_circularity,gaze_direction,pupil_detected,iris_detected
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-06-07 12:55:58,0.000000,11.8,0.000000,0.357953,0.477271,Center,True,False
2025-06-07 12:55:59,2.022857,11.8,0.171429,0.472566,0.343026,Up,True,True
2025-06-07 12:56:00,0.000000,11.8,0.000000,0.297844,0.397126,Center,True,False
2025-06-07 12:56:01,2.269231,11.8,0.192308,0.400992,0.376923,Up,True,True
2025-06-07 12:56:02,0.000000,11.8,0.000000,0.289910,0.386546,Center,True,False
...,...,...,...,...,...,...,...,...
2025-06-07 13:16:09,0.000000,11.8,0.000000,0.304738,0.406317,Center,True,False
2025-06-07 13:16:10,0.000000,11.8,0.000000,0.500000,0.691523,Center,True,False
2025-06-07 13:16:11,0.000000,11.8,0.000000,0.426409,0.568545,Center,True,False
2025-06-07 13:16:12,0.000000,11.8,0.000000,0.363827,0.485957,Center,True,False


In [18]:
comedy_agg.to_csv('../data_collection/generated_data/clara/comedy_ps.csv')
documentary_agg.to_csv('../data_collection/generated_data/clara/documentary_ps.csv')
horror_agg.to_csv('../data_collection/generated_data/clara/horror_ps.csv')