In [1]:
# ==========================================
# 1. Setup
# ==========================================
import os
import re
import glob
import math
from pathlib import Path
from typing import Dict, Optional

import numpy as np
import pandas as pd

pd.options.display.float_format = "{:,.4f}".format

In [2]:
# ==========================================
# 2. Konfiguration
# ==========================================
from pathlib import Path

# Try common relative locations and an explicit project-root path
cwd = Path.cwd()
candidates = [
    cwd / "fixations",
    cwd.parent / "fixations",
    cwd.parent.parent / "fixations",
    Path("../fixations"),
    Path("../../fixations"),
    Path("./../fixations"),
    Path("./fixations"),
    Path(r"c:\Users\SWixforth\Uni\eye-tracking-ai\fixations"),
]

DEFAULT_DIRS = [str(p) for p in candidates]
data_dir = None
for d in DEFAULT_DIRS:
    if os.path.isdir(d):
        data_dir = d
        break

# Fallback: walk up a few parents to find a 'fixations' folder
if data_dir is None:
    cur = cwd
    for _ in range(6):
        candidate = cur / "fixations"
        if candidate.is_dir():
            data_dir = str(candidate)
            break
        cur = cur.parent

if data_dir is None:
    raise FileNotFoundError("Kein Datenordner 'fixations' gefunden. Bitte Pfad anpassen.")

output_dir = "./"
os.makedirs(output_dir, exist_ok=True)
print(data_dir)
# Label-Reihenfolge
BINARY_LABEL_ORDER = ["meme", "ort", "person", "politik", "text"]

# Regex zum Parsen von Dateinamen: P000_id001_meme_10000.csv
FNAME_RE = re.compile(r"^P(?P<participant>\d+)_id(?P<image>\d+)(?:_(?P<basecat>[A-Za-z]+))?(?:_(?P<bin>[01]+))?\.csv$")

summary_csv = os.path.join(output_dir, "feature_engineering_summary.csv")
print(f"Using data_dir={data_dir}\nOutput: {summary_csv}")


c:\Users\SWixforth\Uni\eye-tracking-ai\fixations
Using data_dir=c:\Users\SWixforth\Uni\eye-tracking-ai\fixations
Output: ./feature_engineering_summary.csv


In [3]:
# ==========================================
# 3. Hilfsfunktionen
# ==========================================
def parse_filename_meta(fname: str) -> Dict[str, Optional[str]]:
    name = os.path.basename(fname)
    m = FNAME_RE.match(name)
    meta = {"participant": None, "image_id": None, "base_category": None, "binary_code": None}
    if m:
        meta["participant"] = m.group("participant")
        meta["image_id"] = m.group("image")
        bc = m.group("basecat")
        meta["base_category"] = bc.lower() if bc else None
        meta["binary_code"] = m.group("bin")
    return meta

def labels_from_binary_code(code: Optional[str]) -> Dict[str, int]:
    out = {k: 0 for k in BINARY_LABEL_ORDER}
    if not code:
        return out
    bits = list(code.strip())
    if len(bits) >= len(BINARY_LABEL_ORDER):
        bits = bits[-len(BINARY_LABEL_ORDER):]
    else:
        bits = ["0"] * (len(BINARY_LABEL_ORDER) - len(bits)) + bits
    for lbl, b in zip(BINARY_LABEL_ORDER, bits):
        out[lbl] = int(b)
    return out

def labels_from_weight_cols(df: pd.DataFrame) -> Dict[str, float]:
    weights = {}
    for lbl in BINARY_LABEL_ORDER:
        col = f"weight_{lbl}"
        if col in df.columns:
            val = df[col].dropna()
            weights[lbl] = float(val.iloc[0]) if len(val) else float("nan")
        else:
            weights[lbl] = float("nan")
    return weights

def pick_primary_label(base_category: Optional[str], bin_labels: Dict[str, int], weight_labels: Dict[str, float]) -> Optional[str]:
    if base_category in BINARY_LABEL_ORDER:
        return base_category
    best_lbl, best_w = None, -np.inf
    for lbl, w in weight_labels.items():
        if pd.notna(w) and w > best_w:
            best_lbl, best_w = lbl, w
    if best_lbl is not None and best_w != -np.inf and pd.notna(best_w):
        return best_lbl
    for lbl in BINARY_LABEL_ORDER:
        if bin_labels.get(lbl, 0) == 1:
            return lbl
    return None

def bcea(x: np.ndarray, y: np.ndarray, p: float = 0.68) -> float:
    if len(x) < 2 or len(y) < 2:
        return float("nan")
    sx, sy = np.std(x, ddof=1), np.std(y, ddof=1)
    if sx == 0 or sy == 0:
        return float("nan")
    rho = np.corrcoef(x, y)[0, 1]
    rho = 0.0 if np.isnan(rho) else rho
    k = 3.0 if p >= 0.95 else 1.14
    return 2 * math.pi * k * sx * sy * math.sqrt(max(0.0, 1 - rho**2))

def compute_scanpath_length(xs: np.ndarray, ys: np.ndarray) -> float:
    if len(xs) < 2:
        return 0.0
    return float(np.sum(np.sqrt(np.diff(xs)**2 + np.diff(ys)**2)))

def compute_image_level_features(df: pd.DataFrame, fname: str) -> Dict:
    if df.empty:
        return {"file": os.path.basename(fname), "n_fix": 0}

    weight_labels = labels_from_weight_cols(df)

    xs = df["x"].astype(float).values if "x" in df.columns else np.array([])
    ys = df["y"].astype(float).values if "y" in df.columns else np.array([])
    durs = df["duration"].astype(float).values if "duration" in df.columns else np.array([])

    start_min = float(df["start_time"].min()) if "start_time" in df.columns else float("nan")
    end_max = float(df["end_time"].max()) if "end_time" in df.columns else float("nan")
    view_time_total = float(max(0.0, end_max - start_min)) if not (math.isnan(end_max) or math.isnan(start_min)) else float("nan")

    n_fix = int(len(df))
    sum_dur = float(np.nansum(durs)) if len(durs) else float("nan")
    mean_dur = float(np.nanmean(durs)) if len(durs) else float("nan")
    median_dur = float(np.nanmedian(durs)) if len(durs) else float("nan")

    out = {
        "file": os.path.basename(fname),
        "n_fix": n_fix,
        "view_time_total": view_time_total,
        "sum_fix_dur": sum_dur,
        "fix_dur_mean": mean_dur,
        "fix_dur_median": median_dur,
        "scanpath_length": compute_scanpath_length(xs, ys),
        "bcea_68": bcea(xs, ys, p=0.68),
        "bcea_95": bcea(xs, ys, p=0.95),
    }

    meta = parse_filename_meta(fname)
    bin_labels = labels_from_binary_code(meta.get("binary_code"))
    primary_label = pick_primary_label(meta.get("base_category"), bin_labels, weight_labels)

    out["participant"] = meta.get("participant")
    out["image_id"] = meta.get("image_id")
    out["primary_label"] = primary_label

    return out


In [4]:
# ==========================================
# 4. Dateien einlesen und Features berechnen
# ==========================================
csv_files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
print(f"Found {len(csv_files)} CSV files")

rows = []
for fp in csv_files:
    try:
        df = pd.read_csv(fp)
        rows.append(compute_image_level_features(df, fp))
    except Exception as e:
        print(f"Fehler in {fp}: {e}")

summary = pd.DataFrame(rows)
summary.head()


Found 7362 CSV files


Unnamed: 0,file,n_fix,view_time_total,sum_fix_dur,fix_dur_mean,fix_dur_median,scanpath_length,bcea_68,bcea_95,participant,image_id,primary_label
0,P000_id001_meme_10000.csv,14,5640.854,4393.577,313.8269,282.8405,2586.832,46378.1194,122047.6827,0,1,meme
1,P000_id002_meme_10000.csv,19,6955.267,6373.755,335.4608,249.597,1841.8719,29754.6381,78301.6793,0,2,meme
2,P000_id003_meme_10000.csv,14,5940.504,5241.231,374.3736,307.7835,2159.6009,46590.8309,122607.4496,0,3,meme
3,P000_id004_meme_10000.csv,19,6223.444,5025.302,264.4896,232.967,2623.9104,77827.9004,204810.2641,0,4,meme
4,P000_id005_meme_10000.csv,18,5458.003,4842.601,269.0334,224.6735,2371.0968,60326.9096,158755.0254,0,5,meme


In [5]:

# ==========================================
# 5. Ergebnis speichern
# ==========================================
summary.to_csv(summary_csv, index=False)
print(f"Gespeichert unter: {summary_csv}")
summary.describe(include="all").transpose()


Gespeichert unter: ./feature_engineering_summary.csv


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
file,7362.0,7362.0,P000_id001_meme_10000.csv,1.0,,,,,,,
n_fix,7362.0,,,,29.1788,12.8863,1.0,19.0,27.0,38.0,67.0
view_time_total,7362.0,,,,9590.173,3777.8584,115.432,6355.982,8469.34,14343.6272,15142.61
sum_fix_dur,7362.0,,,,7649.8224,3371.1205,115.432,5093.3483,6776.15,10703.772,14626.72
fix_dur_mean,7362.0,,,,267.7501,77.3289,115.429,218.1892,255.9649,301.6773,1595.7598
fix_dur_median,7362.0,,,,225.4443,51.442,115.429,191.698,216.463,249.601,948.39
scanpath_length,7362.0,,,,3192.8585,1422.2452,0.0,2189.5847,3003.0312,4075.4114,10622.9587
bcea_68,7320.0,,,,86419.6669,42640.4296,0.0,53701.8835,83703.3933,112924.3429,286166.6688
bcea_95,7320.0,,,,227420.1761,112211.6568,0.0,141320.746,220272.0876,297169.3234,753070.1811
participant,7362.0,49.0,000,152.0,,,,,,,


### Interpreting the summary.describe(include="all").transpose() output

This table shows descriptive statistics per feature (each row is one original column from `summary`).

- Numeric columns include:
  - count: number of files with a non-missing value
  - mean, std: average and standard deviation across files
  - min, 25%, 50% (median), 75%, max: distribution quantiles and extremes

- Non-numeric (object/category) columns include:
  - count: number of files (non-missing)
  - unique: number of distinct values
  - top: most frequent value
  - freq: frequency of the most frequent value

- Not applicable statistics appear as NaN.

Notes for key features in this notebook:
- n_fix: number of fixations in a file (rows per CSV).
- view_time_total: end_time − start_time (total viewing time per file).
- sum_fix_dur, fix_dur_mean, fix_dur_median: sum/mean/median of fixation durations in the file.
- scanpath_length: total path length across successive fixations (in the same units as x/y).
- bcea_68, bcea_95: bivariate confidence ellipse area (approx. 68% and 95%).
- file, participant, image_id, primary_label: categorical metadata summarized with top/unique/freq.

Tip: Sort rows by a statistic of interest (e.g., click the header for `mean`) to see which features are highest or most variable.