In [1]:
import pandas as pd
from datetime import datetime, timedelta

In [2]:
config = {
    "comedy": {
        "path": "source_data/kenji/pupil_size_extracted_comedy.csv",
        "start_time": datetime.strptime("2025-06-05 16:25:43", "%Y-%m-%d %H:%M:%S")
    },
    "documentary": {
        "path": "source_data/kenji/pupil_size_extracted_documentary.csv",
        "start_time": datetime.strptime("2025-06-05 17:13:12", "%Y-%m-%d %H:%M:%S")
    },
    "horror": {
        "path": "source_data/kenji/pupil_size_extracted_horror.csv",
        "start_time": datetime.strptime("2025-06-05 17:58:47", "%Y-%m-%d %H:%M:%S")
    },
}

In [3]:
comedy = pd.read_csv(config["comedy"]["path"])
documentary = pd.read_csv(config["documentary"]["path"])
horror = pd.read_csv(config["horror"]["path"])

comedy

Unnamed: 0,success,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,gaze_direction,overall_confidence,pupil_classification,pupil_detected,iris_detected,pupil_center_x,pupil_center_y,iris_center_x,iris_center_y,gaze_magnitude,concentricity_score,pupil_circularity,timestamp
0,True,0.0,11.8,0.0,Center,0.352557,Unknown,True,False,216,7,0,0,0.0,0.0,0.470076,1504.0
1,True,0.0,11.8,0.0,Center,0.291673,Unknown,True,False,37,18,0,0,0.0,0.0,0.388897,1255.0
2,True,0.0,11.8,0.0,Center,0.269764,Unknown,True,False,119,42,0,0,0.0,0.0,0.359685,873.5
3,True,0.0,11.8,0.0,Center,0.343532,Unknown,True,False,135,113,0,0,0.0,0.0,0.458043,599.0
4,True,0.0,11.8,0.0,Center,0.380328,Unknown,True,False,178,129,0,0,0.0,0.0,0.507104,973.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3679,True,0.0,11.8,0.0,Center,0.263127,Unknown,True,False,60,127,0,0,0.0,0.0,0.350836,672.0
3680,True,0.0,11.8,0.0,Center,0.335181,Unknown,True,False,145,107,0,0,0.0,0.0,0.446908,123.0
3681,True,0.0,11.8,0.0,Center,0.478930,Unknown,True,False,7,101,0,0,0.0,0.0,0.638574,1309.0
3682,True,0.0,11.8,0.0,Center,0.329396,Unknown,True,False,4,133,0,0,0.0,0.0,0.439195,1458.0


In [4]:
# drop useless rows 
def clean_rows(df_to_clean):
    return df_to_clean[df_to_clean['success'] == True]

comedy = clean_rows(comedy)
documentary = clean_rows(documentary)
horror = clean_rows(horror)

print(set(comedy.success))
print(set(documentary.success))
print(set(horror.success))

{True}
{True}
{True}


In [5]:
columns_to_drop = ['success', 'pupil_classification', 'pupil_center_x', 'pupil_center_y', 'iris_center_x', 'iris_center_y', 'gaze_magnitude', 'concentricity_score']

comedy.drop(columns=columns_to_drop, inplace=True)
documentary.drop(columns=columns_to_drop, inplace=True)
horror.drop(columns=columns_to_drop, inplace=True)

In [6]:
comedy

Unnamed: 0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,gaze_direction,overall_confidence,pupil_detected,iris_detected,pupil_circularity,timestamp
0,0.0,11.8,0.0,Center,0.352557,True,False,0.470076,1504.0
1,0.0,11.8,0.0,Center,0.291673,True,False,0.388897,1255.0
2,0.0,11.8,0.0,Center,0.269764,True,False,0.359685,873.5
3,0.0,11.8,0.0,Center,0.343532,True,False,0.458043,599.0
4,0.0,11.8,0.0,Center,0.380328,True,False,0.507104,973.5
...,...,...,...,...,...,...,...,...,...
3679,0.0,11.8,0.0,Center,0.263127,True,False,0.350836,672.0
3680,0.0,11.8,0.0,Center,0.335181,True,False,0.446908,123.0
3681,0.0,11.8,0.0,Center,0.478930,True,False,0.638574,1309.0
3682,0.0,11.8,0.0,Center,0.329396,True,False,0.439195,1458.0


In [7]:
# Convert timestamp (float seconds) to actual datetime
comedy['time'] = comedy['timestamp'].apply(lambda x: config["comedy"]["start_time"] + timedelta(seconds=x))
documentary['time'] = documentary['timestamp'].apply(lambda x: config["documentary"]["start_time"] + timedelta(seconds=x))
horror['time'] = horror['timestamp'].apply(lambda x: config["horror"]["start_time"] + timedelta(seconds=x))

comedy

Unnamed: 0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,gaze_direction,overall_confidence,pupil_detected,iris_detected,pupil_circularity,timestamp,time
0,0.0,11.8,0.0,Center,0.352557,True,False,0.470076,1504.0,2025-06-05 16:50:47.000
1,0.0,11.8,0.0,Center,0.291673,True,False,0.388897,1255.0,2025-06-05 16:46:38.000
2,0.0,11.8,0.0,Center,0.269764,True,False,0.359685,873.5,2025-06-05 16:40:16.500
3,0.0,11.8,0.0,Center,0.343532,True,False,0.458043,599.0,2025-06-05 16:35:42.000
4,0.0,11.8,0.0,Center,0.380328,True,False,0.507104,973.5,2025-06-05 16:41:56.500
...,...,...,...,...,...,...,...,...,...,...
3679,0.0,11.8,0.0,Center,0.263127,True,False,0.350836,672.0,2025-06-05 16:36:55.000
3680,0.0,11.8,0.0,Center,0.335181,True,False,0.446908,123.0,2025-06-05 16:27:46.000
3681,0.0,11.8,0.0,Center,0.478930,True,False,0.638574,1309.0,2025-06-05 16:47:32.000
3682,0.0,11.8,0.0,Center,0.329396,True,False,0.439195,1458.0,2025-06-05 16:50:01.000


In [8]:
def reset_timestamp_column(df_to_reset):
    df_to_reset = df_to_reset.sort_values(by='time').reset_index(drop=True)
    df_to_reset['timestamp'] = df_to_reset['time'] 
    df_to_reset.drop(columns=['time'], inplace=True)
    return df_to_reset

comedy = reset_timestamp_column(comedy)
documentary = reset_timestamp_column(documentary)
horror = reset_timestamp_column(horror)

In [9]:
# comedy = comedy.set_index('timestamp')
# documentary = documentary.set_index('timestamp')
# horror = horror.set_index('timestamp')

In [10]:
comedy

Unnamed: 0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,gaze_direction,overall_confidence,pupil_detected,iris_detected,pupil_circularity,timestamp
0,0.0,11.8,0.0,Center,0.458089,True,False,0.610785,2025-06-05 16:25:43.000
1,0.0,11.8,0.0,Center,0.337658,True,False,0.450210,2025-06-05 16:25:43.500
2,0.0,11.8,0.0,Center,0.299254,True,False,0.399006,2025-06-05 16:25:44.000
3,0.0,11.8,0.0,Center,0.346880,True,False,0.462506,2025-06-05 16:25:44.500
4,0.0,11.8,0.0,Center,0.354830,True,False,0.473107,2025-06-05 16:25:45.000
...,...,...,...,...,...,...,...,...,...
3679,0.0,11.8,0.0,Center,0.349275,True,False,0.465700,2025-06-05 16:56:22.500
3680,0.0,11.8,0.0,Center,0.317610,True,False,0.423480,2025-06-05 16:56:23.000
3681,0.0,11.8,0.0,Center,0.243363,True,False,0.324484,2025-06-05 16:56:23.500
3682,0.0,11.8,0.0,Center,0.500000,True,False,0.690612,2025-06-05 16:56:24.000


In [11]:
numeric_cols = ['pupil_diameter_mm', 'iris_diameter_mm', 'pupil_iris_ratio', 'overall_confidence','pupil_circularity', 'timestamp']
non_numeric_cols = ['gaze_direction', 'pupil_detected', 'iris_detected']

def aggregate_data(df_to_aggregate):
    df_to_aggregate['timestamp'] = pd.to_datetime(df_to_aggregate['timestamp']).dt.tz_localize(None)
    df_to_aggregate['time_rounded'] = df_to_aggregate['timestamp'].dt.floor('1s')
    

    # Function to apply to each group
    def agg_group(group):
        numeric_part = group[numeric_cols].mean()
        best_row = group.loc[group['overall_confidence'].idxmax(), non_numeric_cols]
        return pd.concat([numeric_part, best_row])

    aggregated = df_to_aggregate.groupby('time_rounded').apply(agg_group).reset_index()
    return aggregated

In [12]:
comedy_agg = aggregate_data(comedy)
documentary_agg = aggregate_data(documentary)
horror_agg = aggregate_data(horror)

  aggregated = df_to_aggregate.groupby('time_rounded').apply(agg_group).reset_index()
  aggregated = df_to_aggregate.groupby('time_rounded').apply(agg_group).reset_index()
  aggregated = df_to_aggregate.groupby('time_rounded').apply(agg_group).reset_index()


In [13]:
comedy_agg

Unnamed: 0,time_rounded,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,overall_confidence,pupil_circularity,timestamp,gaze_direction,pupil_detected,iris_detected
0,2025-06-05 16:25:43,0.0,11.8,0.0,0.397873,0.530498,2025-06-05 16:25:43.249999872,Center,True,False
1,2025-06-05 16:25:44,0.0,11.8,0.0,0.323067,0.430756,2025-06-05 16:25:44.249999872,Center,True,False
2,2025-06-05 16:25:45,0.0,11.8,0.0,0.319347,0.425796,2025-06-05 16:25:45.249999872,Center,True,False
3,2025-06-05 16:25:46,0.0,11.8,0.0,0.258393,0.344524,2025-06-05 16:25:46.249999872,Center,True,False
4,2025-06-05 16:25:47,0.0,11.8,0.0,0.320713,0.427618,2025-06-05 16:25:47.249999872,Center,True,False
...,...,...,...,...,...,...,...,...,...,...
1837,2025-06-05 16:56:20,0.0,11.8,0.0,0.248906,0.331875,2025-06-05 16:56:20.249999872,Center,True,False
1838,2025-06-05 16:56:21,0.0,11.8,0.0,0.283869,0.378492,2025-06-05 16:56:21.249999872,Center,True,False
1839,2025-06-05 16:56:22,0.0,11.8,0.0,0.326627,0.435503,2025-06-05 16:56:22.249999872,Center,True,False
1840,2025-06-05 16:56:23,0.0,11.8,0.0,0.280486,0.373982,2025-06-05 16:56:23.249999872,Center,True,False


In [14]:
comedy_agg.drop(columns=['timestamp'], inplace=True)
documentary_agg.drop(columns=['timestamp'], inplace=True)
horror_agg.drop(columns=['timestamp'], inplace=True)

comedy_agg['timestamp'] = comedy_agg['time_rounded']
documentary_agg['timestamp'] = documentary_agg['time_rounded']
horror_agg['timestamp'] = horror_agg['time_rounded']

comedy_agg.drop(columns=['time_rounded'], inplace=True)
documentary_agg.drop(columns=['time_rounded'], inplace=True)
horror_agg.drop(columns=['time_rounded'], inplace=True)

In [15]:
comedy_agg.set_index('timestamp', inplace=True)
documentary_agg.set_index('timestamp', inplace=True)
horror_agg.set_index('timestamp', inplace=True)

In [16]:
comedy_agg

Unnamed: 0_level_0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,overall_confidence,pupil_circularity,gaze_direction,pupil_detected,iris_detected
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-06-05 16:25:43,0.0,11.8,0.0,0.397873,0.530498,Center,True,False
2025-06-05 16:25:44,0.0,11.8,0.0,0.323067,0.430756,Center,True,False
2025-06-05 16:25:45,0.0,11.8,0.0,0.319347,0.425796,Center,True,False
2025-06-05 16:25:46,0.0,11.8,0.0,0.258393,0.344524,Center,True,False
2025-06-05 16:25:47,0.0,11.8,0.0,0.320713,0.427618,Center,True,False
...,...,...,...,...,...,...,...,...
2025-06-05 16:56:20,0.0,11.8,0.0,0.248906,0.331875,Center,True,False
2025-06-05 16:56:21,0.0,11.8,0.0,0.283869,0.378492,Center,True,False
2025-06-05 16:56:22,0.0,11.8,0.0,0.326627,0.435503,Center,True,False
2025-06-05 16:56:23,0.0,11.8,0.0,0.280486,0.373982,Center,True,False


In [17]:
documentary_agg

Unnamed: 0_level_0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,overall_confidence,pupil_circularity,gaze_direction,pupil_detected,iris_detected
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-06-05 17:13:12,0.000000,11.8,0.000000,0.280724,0.374299,Center,True,False
2025-06-05 17:13:13,0.000000,11.8,0.000000,0.334065,0.445420,Center,True,False
2025-06-05 17:13:14,0.000000,11.8,0.000000,0.289081,0.385442,Center,True,False
2025-06-05 17:13:15,0.000000,11.8,0.000000,0.373520,0.498027,Center,True,False
2025-06-05 17:13:16,0.000000,11.8,0.000000,0.323767,0.431690,Center,True,False
...,...,...,...,...,...,...,...,...
2025-06-05 17:44:06,0.000000,11.8,0.000000,0.310843,0.414457,Center,True,False
2025-06-05 17:44:07,0.000000,11.8,0.000000,0.345671,0.460895,Center,True,False
2025-06-05 17:44:08,0.000000,11.8,0.000000,0.262832,0.350442,Center,True,False
2025-06-05 17:44:09,0.000000,11.8,0.000000,0.253057,0.337409,Center,True,False


In [18]:
horror_agg

Unnamed: 0_level_0,pupil_diameter_mm,iris_diameter_mm,pupil_iris_ratio,overall_confidence,pupil_circularity,gaze_direction,pupil_detected,iris_detected
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-06-05 17:58:47,0.0,11.8,0.0,0.341908,0.455878,Center,True,False
2025-06-05 17:58:48,0.0,11.8,0.0,0.349734,0.466312,Center,True,False
2025-06-05 17:58:49,0.0,11.8,0.0,0.389688,0.528438,Center,True,False
2025-06-05 17:58:50,0.0,11.8,0.0,0.463993,0.618657,Center,True,False
2025-06-05 17:58:51,0.0,11.8,0.0,0.429819,0.573093,Center,True,False
...,...,...,...,...,...,...,...,...
2025-06-05 18:27:15,0.0,11.8,0.0,0.334241,0.445655,Center,True,False
2025-06-05 18:27:16,0.0,11.8,0.0,0.312355,0.416473,Center,True,False
2025-06-05 18:27:17,0.0,11.8,0.0,0.366722,0.488962,Center,True,False
2025-06-05 18:27:18,0.0,11.8,0.0,0.345114,0.460152,Center,True,False


In [21]:
comedy_agg.to_csv('../data_collection/generated_data/comedy_ps.csv')
documentary_agg.to_csv('../data_collection/generated_data/documentary_ps.csv')
horror_agg.to_csv('../data_collection/generated_data/horror_ps.csv')