# Descriptive analysis (per image)

This notebook builds per-image metrics by combining:
- feature_engineering_summary.csv (file-level features)
- Raw fixation CSVs in the fixations folder (for pupil metrics)

Outputs a per-image table with:
- number_of_fixations (sum over files/participants)
- fixation_duration_mean_weighted, fixation_duration_median_approx
- view_time_total_sum, scanpath_length_mean, BCEA_68_mean, BCEA_95_mean
- pupil metrics per image (mean and std for each available pupil column)


In [2]:
# Setup and paths
import os
from pathlib import Path
import pandas as pd
import numpy as np

# Resolve project root (assumes this file in data_analysis/descriptive_analysis)
nb_dir = Path.cwd()
project_root = nb_dir.parents[2] if len(nb_dir.parents) >= 2 else nb_dir

summary_candidates = [
    nb_dir.parent / "feature_engineering" / "feature_engineering_summary.csv",
    project_root / "data_analysis" / "feature_engineering" / "feature_engineering_summary.csv",
]
summary_path = next((p for p in summary_candidates if p.exists()), None)
if summary_path is None:
    raise FileNotFoundError("feature_engineering_summary.csv not found.")

fixations_candidates = [
    project_root / "fixations",
    nb_dir.parents[3] / "fixations" if len(nb_dir.parents) >= 3 else nb_dir / "fixations",
    Path(r"c:\\Users\\SWixforth\\Uni\\eye-tracking-ai\\fixations"),
]
fixations_dir = next((p for p in fixations_candidates if p.exists()), None)
if fixations_dir is None:
    raise FileNotFoundError("fixations folder not found.")

print(f"Using summary: {summary_path}\nUsing fixations: {fixations_dir}")


Using summary: c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\feature_engineering\feature_engineering_summary.csv
Using fixations: c:\Users\SWixforth\Uni\eye-tracking-ai\fixations


In [3]:
# Load feature_engineering summary
summary = pd.read_csv(summary_path)

# Ensure image_id is string with zero padding as in filenames
summary["image_id"] = summary["image_id"].astype(str).str.zfill(3)

# Per-image aggregates from summary
per_image_summary = (
    summary.groupby("image_id").agg(
        number_of_fixations=("n_fix", "sum"),
        view_time_total_sum=("view_time_total", "sum"),
        fixation_duration_mean_weighted=("fix_dur_mean", "mean"),
        fixation_duration_median_approx=("fix_dur_median", "median"),
        scanpath_length_mean=("scanpath_length", "mean"),
        BCEA_68_mean=("bcea_68", "mean"),
        BCEA_95_mean=("bcea_95", "mean"),
        primary_label_top=("primary_label", lambda s: s.mode().iloc[0] if not s.mode().empty else np.nan),
    )
    .reset_index()
)

per_image_summary.head()

Unnamed: 0,image_id,number_of_fixations,view_time_total_sum,fixation_duration_mean_weighted,fixation_duration_median_approx,scanpath_length_mean,BCEA_68_mean,BCEA_95_mean,primary_label_top
0,1,1159,425375.042,284.32902,232.363,3688.463661,72008.690066,189496.552805,meme
1,2,1284,452560.267,274.577679,232.927,2665.561573,60359.429235,158840.603249,meme
2,3,1110,411435.771,296.86351,245.25175,3022.834081,40009.274302,105287.563953,meme
3,4,1354,469790.081,280.162546,216.588,3536.890776,87693.362497,230772.006571,meme
4,5,1359,460515.114,265.496762,216.431,3734.527053,90696.081085,238673.897592,meme


In [4]:
# Build pupil metrics per image from raw fixations
import glob
import re

fname_re = re.compile(r"^P(?P<participant>\d+)_id(?P<image>\d+).+\.csv$")

pupil_rows = []
for fp in glob.glob(str(fixations_dir / "*.csv")):
    name = os.path.basename(fp)
    m = fname_re.match(name)
    if not m:
        continue
    image_id = m.group("image")
    image_id = str(image_id).zfill(3)
    try:
        df = pd.read_csv(fp)
    except Exception as e:
        print(f"Failed to read {name}: {e}")
        continue

    # Identify pupil columns (common names: pupil, pupil_left/right, pupil_size, pupil_diameter)
    # Also include avg_pupil_size if present (in mm), even though it doesn't start with 'pupil'
    lower_cols = {c.lower(): c for c in df.columns}
    pupil_like = [c for c in df.columns if c.lower().startswith("pupil")] 
    if "avg_pupil_size" in lower_cols:
        pupil_like.append(lower_cols["avg_pupil_size"])  # preserve original case
    # De-duplicate while preserving order
    seen = set()
    pupil_cols = []
    for c in pupil_like:
        if c not in seen:
            seen.add(c)
            pupil_cols.append(c)
    if not pupil_cols:
        continue

    stats = {"image_id": image_id}
    for c in pupil_cols:
        s = pd.to_numeric(df[c], errors="coerce")
        n = int(s.count())
        stats[f"{c}_n"] = n
        stats[f"{c}_mean"] = float(s.mean()) if n > 0 else np.nan
        stats[f"{c}_std"] = float(s.std(ddof=1)) if n > 1 else np.nan
        # For normalized series, capture magnitude away from baseline
        if c.lower() == "pupil_size_norm":
            stats[f"{c}_abs_mean"] = float(s.abs().mean()) if n > 0 else np.nan
            stats[f"{c}_rms"] = float(np.sqrt(np.nanmean((s ** 2)))) if n > 0 else np.nan
    pupil_rows.append(stats)

pupil_df = pd.DataFrame(pupil_rows)
# Reduce to per-image by averaging across files if multiple files per image
if not pupil_df.empty:
    agg_map = {col: "mean" for col in pupil_df.columns if col != "image_id"}
    pupil_per_image = pupil_df.groupby("image_id").agg(agg_map).reset_index()
else:
    pupil_per_image = pd.DataFrame(columns=["image_id"])  # empty

pupil_per_image.head()

Unnamed: 0,image_id,pupil_size_norm_n,pupil_size_norm_mean,pupil_size_norm_std,pupil_size_norm_abs_mean,pupil_size_norm_rms,avg_pupil_size_n,avg_pupil_size_mean,avg_pupil_size_std
0,1,23.632653,2.101475e-16,1.0,0.795441,0.970433,23.653061,3.722185,0.171618
1,2,26.204082,-2.377767e-16,1.0,0.781776,0.976193,26.204082,3.705522,0.146082
2,3,23.104167,2.772367e-16,1.0,0.796986,0.974179,23.125,3.83886,0.181887
3,4,27.632653,-7.328157e-17,1.0,0.791707,0.976738,27.632653,3.761114,0.164214
4,5,27.734694,-4.174451e-17,1.0,0.775973,0.975443,27.734694,4.008238,0.203668


### Pupil data computation (overview)
- Inputs (from Tobii export): `left_pupil_diameter`, `right_pupil_diameter`.
- Cleaning: values outside the plausible human range [1.5, 8] mm → NaN; non‑numeric/corrupted entries → NaN.
- Combine eyes per sample:
  - both valid → average(left, right)
  - one valid → use that eye
  - none valid → NaN
- Sanity pass: mask the combined average again if outside [1.5, 8] mm.
- Interpolation: short gaps in `pupil_size` were linearly interpolated (together with x and y) to bridge blinks/dropped samples.
- Output: cleaned, interpolated `pupil_size` alongside x, y, and a millisecond timestamp (relative to the first frame).

Normalization and negatives
- `pupil_size_norm` is a normalized series (e.g., z‑score or baseline‑relative change).
- Per‑fixation means like `pupil_size_norm_mean` can be negative (below baseline/mean) or positive (above).
- For the exact normalization used here, see the code around cell 4 in this notebook.

In [6]:
# Time dynamics: first vs last third mean fixation duration per image
import numpy as np

time_rows = []
for fp in glob.glob(str(fixations_dir / "*.csv")):
    name = os.path.basename(fp)
    m = fname_re.match(name)
    if not m:
        continue
    image_id = str(m.group("image")).zfill(3)
    try:
        df = pd.read_csv(fp)
    except Exception as e:
        print(f"Failed to read {name}: {e}")
        continue
    # Need start_time, end_time, duration; derive end_time if missing
    if not {"start_time","end_time","duration"}.issubset(df.columns):
        if "start_time" in df.columns and "duration" in df.columns:
            df = df.copy()
            df["end_time"] = pd.to_numeric(df["start_time"], errors="coerce") + pd.to_numeric(df["duration"], errors="coerce")
        else:
            continue
    st = pd.to_numeric(df["start_time"], errors="coerce")
    et = pd.to_numeric(df["end_time"], errors="coerce")
    dur = pd.to_numeric(df["duration"], errors="coerce")
    mask_valid = st.notna() & et.notna() & dur.notna()
    if not mask_valid.any():
        continue
    st, et, dur = st[mask_valid], et[mask_valid], dur[mask_valid]
    t0, t1 = st.min(), et.max()
    if not np.isfinite(t0) or not np.isfinite(t1) or t1 <= t0:
        continue
    b1 = t0 + (t1 - t0) / 3.0
    b2 = t0 + 2.0 * (t1 - t0) / 3.0
    mid = (st + et) / 2.0
    first_mask = mid < b1
    last_mask = mid >= b2
    first_mean = float(dur[first_mask].mean()) if first_mask.any() else np.nan
    last_mean = float(dur[last_mask].mean()) if last_mask.any() else np.nan
    n_first = int(first_mask.sum())
    n_last = int(last_mask.sum())
    time_rows.append({
        "image_id": image_id,
        "fix_dur_mean_first_third": first_mean,
        "fix_dur_mean_last_third": last_mean,
        "n_fix_first_third": n_first,
        "n_fix_last_third": n_last,
    })

time_dyn_df = pd.DataFrame(time_rows)
if not time_dyn_df.empty:
    time_dyn_per_image = (
        time_dyn_df.groupby("image_id").agg({
            "fix_dur_mean_first_third": "mean",
            "fix_dur_mean_last_third": "mean",
            "n_fix_first_third": "sum",
            "n_fix_last_third": "sum",
        }).reset_index()
    )
else:
    time_dyn_per_image = pd.DataFrame(columns=[
        "image_id","fix_dur_mean_first_third","fix_dur_mean_last_third","n_fix_first_third","n_fix_last_third"
    ])

time_dyn_per_image.head()

Unnamed: 0,image_id,fix_dur_mean_first_third,fix_dur_mean_last_third,n_fix_first_third,n_fix_last_third
0,1,274.59095,311.45011,406,378
1,2,270.570046,286.048076,440,434
2,3,261.323237,375.535697,415,350
3,4,269.514004,306.053531,477,445
4,5,255.631797,281.616679,480,435


In [7]:
# Join per-image summary with pupil metrics and time dynamics
per_image = per_image_summary.merge(pupil_per_image, on="image_id", how="left")
per_image = per_image.merge(time_dyn_per_image, on="image_id", how="left")

# Build a human-readable version with rounded values and clearer column names
per_image_pretty = per_image.copy()

# Column renames with units where helpful
rename_map = {
    "number_of_fixations": "fixations_total",
    "view_time_total_sum": "view_time_total_sum_ms",
    "fixation_duration_mean_weighted": "fixation_duration_mean_ms",
    "fixation_duration_median_approx": "fixation_duration_median_ms",
    "scanpath_length_mean": "scanpath_length_mean_px",
    "BCEA_68_mean": "bcea68_mean_px2",
    "BCEA_95_mean": "bcea95_mean_px2",
    "primary_label_top": "primary_label",
    "fix_dur_mean_first_third": "fix_dur_mean_first_third_ms",
    "fix_dur_mean_last_third": "fix_dur_mean_last_third_ms",
    "n_fix_first_third": "fixations_first_third",
    "n_fix_last_third": "fixations_last_third",
}
# Also rename common pupil fields if present
if "pupil_size_norm_mean" in per_image_pretty.columns:
    rename_map["pupil_size_norm_mean"] = "pupil_norm_mean"
if "pupil_size_norm_std" in per_image_pretty.columns:
    rename_map["pupil_size_norm_std"] = "pupil_norm_std"
if "pupil_size_norm_abs_mean" in per_image_pretty.columns:
    rename_map["pupil_size_norm_abs_mean"] = "pupil_norm_abs_mean"
if "pupil_size_norm_rms" in per_image_pretty.columns:
    rename_map["pupil_size_norm_rms"] = "pupil_norm_rms"
# avg_pupil_size likely in mm
if "avg_pupil_size_mean" in per_image_pretty.columns:
    rename_map["avg_pupil_size_mean"] = "pupil_mm_mean"
if "avg_pupil_size_std" in per_image_pretty.columns:
    rename_map["avg_pupil_size_std"] = "pupil_mm_std"
per_image_pretty = per_image_pretty.rename(columns=rename_map)

# Round numeric columns to readable precision
round_specs = {
    # durations and times in ms
    "fixation_duration_mean_ms": 1,
    "fixation_duration_median_ms": 1,
    "fix_dur_mean_first_third_ms": 1,
    "fix_dur_mean_last_third_ms": 1,
    "view_time_total_sum_ms": 0,
    # path length and BCEA
    "scanpath_length_mean_px": 1,
    "bcea68_mean_px2": 0,
    "bcea95_mean_px2": 0,
    # normalized pupil
    "pupil_norm_mean": 3,
    "pupil_norm_std": 3,
    "pupil_norm_abs_mean": 3,
    "pupil_norm_rms": 3,
    # raw mm
    "pupil_mm_mean": 3,
    "pupil_mm_std": 3,
}
for col, ndigs in round_specs.items():
    if col in per_image_pretty.columns:
        per_image_pretty[col] = per_image_pretty[col].round(ndigs)

# Ensure counts are integers
for col in ["fixations_total", "fixations_first_third", "fixations_last_third"]:
    if col in per_image_pretty.columns:
        per_image_pretty[col] = per_image_pretty[col].astype("Int64")

# Order columns for readability
preferred_order = [
    "image_id", "primary_label",
    "fixations_total", "fixations_first_third", "fixations_last_third",
    "view_time_total_sum_ms",
    "fixation_duration_mean_ms", "fixation_duration_median_ms",
    "fix_dur_mean_first_third_ms", "fix_dur_mean_last_third_ms",
    "scanpath_length_mean_px", "bcea68_mean_px2", "bcea95_mean_px2",
    # pupil
    "pupil_mm_mean", "pupil_mm_std",
    "pupil_norm_mean", "pupil_norm_std", "pupil_norm_abs_mean", "pupil_norm_rms",
]
cols = [c for c in preferred_order if c in per_image_pretty.columns] + [
    c for c in per_image_pretty.columns if c not in preferred_order
]
per_image_pretty = per_image_pretty[cols]

# Save both detailed and pretty CSVs
precise_csv = nb_dir / "per_image_descriptive_summary.csv"
pretty_csv = nb_dir / "per_image_descriptive_summary_pretty.csv"
per_image.to_csv(precise_csv, index=False)
per_image_pretty.to_csv(pretty_csv, index=False)
print(f"Saved per-image descriptive summary (precise): {precise_csv}")
print(f"Saved per-image descriptive summary (pretty):  {pretty_csv}")

per_image_pretty.head(10)

Saved per-image descriptive summary (precise): c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\descriptive_analysis\per_image_descriptive_summary.csv
Saved per-image descriptive summary (pretty):  c:\Users\SWixforth\Uni\eye-tracking-ai\data_analysis\descriptive_analysis\per_image_descriptive_summary_pretty.csv


Unnamed: 0,image_id,primary_label,fixations_total,fixations_first_third,fixations_last_third,view_time_total_sum_ms,fixation_duration_mean_ms,fixation_duration_median_ms,fix_dur_mean_first_third_ms,fix_dur_mean_last_third_ms,...,bcea68_mean_px2,bcea95_mean_px2,pupil_mm_mean,pupil_mm_std,pupil_norm_mean,pupil_norm_std,pupil_norm_abs_mean,pupil_norm_rms,pupil_size_norm_n,avg_pupil_size_n
0,1,meme,1159,406,378,425375.0,284.3,232.4,274.6,311.5,...,72009.0,189497.0,3.722,0.172,0.0,1.0,0.795,0.97,23.632653,23.653061
1,2,meme,1284,440,434,452560.0,274.6,232.9,270.6,286.0,...,60359.0,158841.0,3.706,0.146,-0.0,1.0,0.782,0.976,26.204082,26.204082
2,3,meme,1110,415,350,411436.0,296.9,245.3,261.3,375.5,...,40009.0,105288.0,3.839,0.182,0.0,1.0,0.797,0.974,23.104167,23.125
3,4,meme,1354,477,445,469790.0,280.2,216.6,269.5,306.1,...,87693.0,230772.0,3.761,0.164,-0.0,1.0,0.792,0.977,27.632653,27.632653
4,5,meme,1359,480,435,460515.0,265.5,216.4,255.6,281.6,...,90696.0,238674.0,4.008,0.204,-0.0,1.0,0.776,0.975,27.734694,27.734694
5,6,meme,1209,440,375,414305.0,260.9,232.9,251.0,287.2,...,106973.0,281508.0,3.499,0.185,0.0,1.0,0.774,0.977,24.653061,24.673469
6,7,meme,1224,436,394,429383.0,278.4,216.7,262.9,292.2,...,79181.0,208370.0,3.636,0.173,0.0,1.0,0.756,0.968,24.979592,24.979592
7,8,meme,1108,378,362,433120.0,318.8,232.6,314.0,358.7,...,97520.0,256633.0,4.016,0.196,0.0,1.0,0.789,0.974,23.083333,23.083333
8,9,meme,1417,512,449,461029.0,262.6,232.7,244.3,273.9,...,111613.0,293717.0,4.057,0.176,-0.0,1.0,0.789,0.98,28.897959,28.918367
9,10,meme,1469,530,474,510862.0,288.1,241.2,275.8,289.6,...,55985.0,147330.0,3.706,0.166,-0.0,1.0,0.797,0.975,29.979592,29.979592


In [6]:
# Quick sanity check tables
print("Per-image counts and durations (head):")
print(per_image[[
    "image_id","number_of_fixations","view_time_total_sum","fixation_duration_mean_weighted","fixation_duration_median_approx"
]].head())

num_cols = [c for c in per_image.columns if per_image[c].dtype != "O" and c != "image_id"]
per_image[num_cols].describe().T

Per-image counts and durations (head):
  image_id  number_of_fixations  view_time_total_sum  \
0      001                 1159           425375.042   
1      002                 1284           452560.267   
2      003                 1110           411435.771   
3      004                 1354           469790.081   
4      005                 1359           460515.114   

   fixation_duration_mean_weighted  fixation_duration_median_approx  
0                       284.329020                        232.36300  
1                       274.577679                        232.92700  
2                       296.863510                        245.25175  
3                       280.162546                        216.58800  
4                       265.496762                        216.43100  


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number_of_fixations,152.0,1413.25,342.0339,944.0,1200.5,1320.5,1570.25,2556.0
view_time_total_sum,152.0,464492.5,66842.95,360408.4,421319.5,447068.2,481220.1,691547.0
fixation_duration_mean_weighted,152.0,267.786,35.58548,196.0005,238.8399,274.5366,295.3214,351.0048
fixation_duration_median_approx,152.0,223.3594,21.55692,182.442,199.9045,232.5068,241.0259,266.217
scanpath_length_mean,152.0,3192.534,718.8735,1933.301,2699.73,3018.405,3587.123,5197.24
BCEA_68_mean,152.0,86384.22,29430.21,19130.83,65209.67,83523.58,103959.4,187075.4
BCEA_95_mean,152.0,227326.9,77447.92,50344.28,171604.4,219798.9,273577.3,492303.7
pupil_size_norm_mean,152.0,-3.2811940000000005e-17,3.92829e-16,-1.111996e-15,-2.525365e-16,-3.605276e-18,2.119864e-16,1.13485e-15
pupil_size_norm_std,152.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
fix_dur_mean_first_third,152.0,253.9455,37.97449,190.8019,216.502,259.955,282.9442,332.8939


## Notes
- All metrics are aggregated per image_id across all participants/files.
- Pupil metrics were computed from raw fixations CSVs and averaged per image.
- Time dynamics (first vs last third) is computed using fixation midpoints within each image’s total viewing span.
- If no pupil columns exist in the fixations files, those columns will be missing (NaN).


In [5]:
# Debug: Check for missing participants
# ==========================================
import glob
import re

# First, check the feature engineering summary source
print("=== FEATURE ENGINEERING SUMMARY CHECK ===")
print(f"Total rows in summary: {len(summary)}")
print(f"Unique participants in summary: {summary['participant'].nunique()}")
print(f"Participant range in summary: {summary['participant'].min()} to {summary['participant'].max()}")

# Check which participants we found
found_participants = set(summary['participant'].dropna().astype(str))
print(f"\nFound participants: {sorted(found_participants, key=int)}")

# The issue: participants are 0-54, but some are missing
# Check which ones in 0-54 range are missing
expected_participants_numeric = set(range(0, 55))  # 0 to 54
found_participants_numeric = set(summary['participant'].dropna().astype(int))
missing_participants_numeric = expected_participants_numeric - found_participants_numeric

if missing_participants_numeric:
    print(f"\nMissing participants (numeric): {sorted(missing_participants_numeric)}")
    print(f"Count missing: {len(missing_participants_numeric)}")
else:
    print("\nNo missing participants in expected range 0-54")

# Check CSV files in fixations folder
print(f"\n=== FIXATIONS FOLDER CHECK ===")
csv_files = list(glob.glob(str(fixations_dir / "*.csv")))
print(f"Total CSV files found: {len(csv_files)}")

# Parse all filenames to see which participants have files
participants_with_files = set()
failed_parsing = []
fname_re = re.compile(r"^P(?P<participant>\d+)_id(?P<image>\d+).+\.csv$")

for fp in csv_files:
    name = os.path.basename(fp)
    m = fname_re.match(name)
    if m:
        participants_with_files.add(m.group("participant").zfill(3))
    else:
        failed_parsing.append(name)

print(f"Participants with CSV files: {len(participants_with_files)}")
print(f"Participants with files: {sorted(participants_with_files)}")

if failed_parsing:
    print(f"\nFiles that failed parsing (first 10): {failed_parsing[:10]}")

# Check which participants have files but missing from summary
missing_from_summary = participants_with_files - found_participants
missing_files = expected_participants - participants_with_files

if missing_from_summary:
    print(f"\nParticipants with files but missing from summary: {sorted(missing_from_summary)}")
if missing_files:
    print(f"\nParticipants missing CSV files entirely: {sorted(missing_files)}")

# Check file counts per participant
print(f"\n=== FILE COUNTS PER PARTICIPANT ===")
file_counts = {}
for fp in csv_files:
    name = os.path.basename(fp)
    m = fname_re.match(name)
    if m:
        p = m.group("participant").zfill(3)
        file_counts[p] = file_counts.get(p, 0) + 1

# Show participants with unusual file counts
expected_files_per_participant = 153  # or whatever you expect
unusual_counts = {p: count for p, count in file_counts.items() if count != expected_files_per_participant}
if unusual_counts:
    print(f"Participants with unusual file counts (expected {expected_files_per_participant}):")
    for p, count in sorted(unusual_counts.items()):
        print(f"  P{p}: {count} files")
else:
    print(f"All participants have {expected_files_per_participant} files")

print(f"\nFile count summary: min={min(file_counts.values()) if file_counts else 0}, max={max(file_counts.values()) if file_counts else 0}, mean={np.mean(list(file_counts.values())) if file_counts else 0:.1f}")

=== FEATURE ENGINEERING SUMMARY CHECK ===
Total rows in summary: 7362
Unique participants in summary: 49
Participant range in summary: 0 to 54

Found participants: ['0', '1', '2', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '18', '19', '20', '21', '23', '24', '25', '26', '27', '28', '29', '30', '31', '33', '34', '35', '36', '37', '38', '39', '40', '41', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54']

Missing participants (numeric): [3, 17, 22, 32, 42, 43]
Count missing: 6

=== FIXATIONS FOLDER CHECK ===
Total CSV files found: 7362
Participants with CSV files: 49
Participants with files: ['000', '001', '002', '004', '005', '006', '007', '008', '009', '010', '011', '012', '013', '014', '015', '016', '018', '019', '020', '021', '023', '024', '025', '026', '027', '028', '029', '030', '031', '033', '034', '035', '036', '037', '038', '039', '040', '041', '044', '045', '046', '047', '048', '049', '050', '051', '052', '053', '054']

Participants w

In [6]:
# Simple missing participants check
print("=== MISSING PARTICIPANTS SUMMARY ===")

# Check which participants 0-54 are missing from feature engineering summary
expected_participants_numeric = set(range(0, 55))  # 0 to 54 (55 total)
found_participants_numeric = set(summary['participant'].dropna().astype(int))
missing_participants_numeric = sorted(expected_participants_numeric - found_participants_numeric)

print(f"Expected participants: 0 to 54 (55 total)")
print(f"Found participants: {len(found_participants_numeric)}")
print(f"Missing participants: {missing_participants_numeric}")
print(f"Count missing: {len(missing_participants_numeric)}")

# Check if missing participants have CSV files
if missing_participants_numeric:
    print(f"\nChecking if missing participants have CSV files...")
    fname_re = re.compile(r"^P(?P<participant>\d+)_id(?P<image>\d+).+\.csv$")
    
    participants_with_files = set()
    for fp in csv_files:
        name = os.path.basename(fp)
        m = fname_re.match(name)
        if m:
            participant_num = int(m.group("participant"))
            participants_with_files.add(participant_num)
    
    missing_have_files = set(missing_participants_numeric) & participants_with_files
    missing_no_files = set(missing_participants_numeric) - participants_with_files
    
    if missing_have_files:
        print(f"Missing participants that HAVE CSV files: {sorted(missing_have_files)}")
        print("→ These should be processed by feature engineering!")
    
    if missing_no_files:
        print(f"Missing participants with NO CSV files: {sorted(missing_no_files)}")
        print("→ These participants don't exist in the data")

# Calculate expected total
expected_total = len(found_participants_numeric) * 153  # 153 images
actual_total = len(summary)
print(f"\nExpected total rows (49 participants × 153 images): {expected_total}")
print(f"Actual total rows: {actual_total}")
print(f"Difference: {actual_total - expected_total}")

=== MISSING PARTICIPANTS SUMMARY ===
Expected participants: 0 to 54 (55 total)
Found participants: 49
Missing participants: [3, 17, 22, 32, 42, 43]
Count missing: 6

Checking if missing participants have CSV files...
Missing participants with NO CSV files: [3, 17, 22, 32, 42, 43]
→ These participants don't exist in the data

Expected total rows (49 participants × 153 images): 7497
Actual total rows: 7362
Difference: -135
