Baseline 

In [None]:
import os
import time
import warnings
import itertools
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, cross_val_score
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")

# =============================================================
# BASELINE-VERSION (praktisch kein Preprocessing)
# - Spaltenauswahl: nur 'Gaze X', 'Gaze Y'
# - Typkonvertierung -> numerisch
# - Fehlende Werte: nur löschen (minimal-invasiv), sonst nichts
# - Keine Outlier-Behandlung, keine Filter, kein Scaling
# - Saubere Evaluation per Group-Splits (pro Datei)
# - Drei Methoden: 80/20 GroupSplit, 5-Fold GroupKFold, OOB
# =============================================================

# ------------------------ Laden ------------------------ #
def to_f32(df: pd.DataFrame) -> pd.DataFrame:
    """Cast every column of ``df`` to ``float32``.

    ``copy=False`` is forwarded to ``DataFrame.astype`` so pandas may skip
    the copy when it is able to.
    """
    single_precision = np.float32
    return df.astype(dtype=single_precision, copy=False)


def load_csv(path: str) -> pd.DataFrame:
    """Read one CSV file and return its two gaze columns as float32.

    The file is parsed with ``,`` as the decimal mark. Header names are
    matched after stripping whitespace and lower-casing; if no header
    matches, the literal names ``"Gaze X"`` / ``"Gaze Y"`` are used. Values
    that cannot be parsed as numbers become NaN, and every row containing a
    NaN is dropped before the index is renumbered.
    """
    raw = pd.read_csv(path, decimal=",", low_memory=False)

    # Map normalised header -> header as it appears in the file.
    by_normalized_name = {name.strip().lower(): name for name in raw.columns}
    wanted = [
        by_normalized_name.get("gaze x", "Gaze X"),
        by_normalized_name.get("gaze y", "Gaze Y"),
    ]

    gaze = raw[wanted].copy()
    gaze = gaze.apply(pd.to_numeric, errors="coerce")
    # Baseline policy: rows with missing values are removed, nothing else.
    gaze = gaze.dropna()
    gaze = gaze.reset_index(drop=True)
    return to_f32(gaze)


def preload_folder(folder: str, target: int):
    """Load every ``.csv`` file of ``folder`` and tag it with ``target``.

    Args:
        folder: Directory to scan (not recursive).
        target: Class label attached to every file of this folder.

    Returns:
        A list of ``(file_name, dataframe, target)`` tuples, ordered by file
        name. Files that fail to load are reported on stdout and skipped;
        files that load to an empty frame are skipped silently.
    """
    out = []
    # os.listdir() yields entries in arbitrary order; sorting makes the row
    # order of the resulting dataset reproducible across runs and machines.
    for fn in sorted(os.listdir(folder)):
        if not fn.lower().endswith(".csv"):
            continue
        p = os.path.join(folder, fn)
        try:
            d = load_csv(p)
        except Exception as e:
            # Deliberately best-effort: one broken file must not stop the run.
            print(f"⚠️ Fehler beim Laden {p}: {e}")
            continue
        if not d.empty:
            out.append((fn, d, target))
    return out


# ------------------------ Dataset bauen ------------------------ #
# as_features=True: 1 Zeile pro Datei (sehr schnell, robust, kein Leakage)
# as_features=False: zeilenbasiert (größer, aber weiterhin gruppensicher)

def build_dataset(preloaded_files, as_features: bool = True):
    """Assemble ``X``, ``y`` and ``groups`` from preloaded files.

    Args:
        preloaded_files: Iterable of ``(file_name, dataframe, target)``.
        as_features: If true, each file contributes one row of summary
            statistics (mean, sample std and IQR of the first two columns).
            If false, every row of every file is kept.

    Returns:
        ``(X, y, groups)`` where ``X`` is float32, ``y`` is int8 and
        ``groups`` holds the file name of each row. For empty input the
        result is ``(empty DataFrame, empty array, [])``.
    """
    frames, labels, group_ids = [], [], []

    for file_name, gaze, label in preloaded_files:
        if not as_features:
            # Row mode: keep the raw samples, one label/group entry per row.
            n_rows = len(gaze)
            frames.append(gaze)
            labels.append(np.full(n_rows, label, dtype=np.int8))
            group_ids.extend([file_name] * n_rows)
            continue

        # Feature mode: one summary row per file.
        gx, gy = gaze.iloc[:, 0], gaze.iloc[:, 1]
        summary = {
            "gx_mean": gx.mean(),
            "gx_std": gx.std(ddof=1),
            "gy_mean": gy.mean(),
            "gy_std": gy.std(ddof=1),
            "gx_iqr": gx.quantile(0.75) - gx.quantile(0.25),
            "gy_iqr": gy.quantile(0.75) - gy.quantile(0.25),
        }
        frames.append(pd.DataFrame([summary], index=[0]))
        labels.append(np.array([label], dtype=np.int8))
        group_ids.append(file_name)

    if len(frames) == 0:
        return pd.DataFrame(), np.array([]), []

    X = pd.concat(frames, ignore_index=True)
    y = np.concatenate(labels).astype(np.int8)
    return to_f32(X), y, np.array(group_ids)


# ------------------------ Evaluation ------------------------ #
def evaluate_three_methods(X: pd.DataFrame, y: np.ndarray, groups: np.ndarray, n_splits: int = 5):
    """Score one random-forest configuration with three validation schemes.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.
        groups: Group id per row; used by the group-aware splitters.
        n_splits: Number of folds for the GroupKFold run.

    Returns:
        A list of ``[pipeline, method, accuracy, std]`` rows in the order
        80/20 GroupShuffleSplit, GroupKFold, out-of-bag.
    """
    # Random forest as a solid baseline; OOB scoring enabled so the final fit
    # can double as a bootstrap-style estimate. The same estimator object is
    # refit for the hold-out run and the OOB run.
    forest = RandomForestClassifier(
        bootstrap=True,
        oob_score=True,
        n_estimators=200,
        max_features="log2",
        min_samples_leaf=2,
        min_samples_split=2,
        random_state=42,
        n_jobs=-1,
    )
    results = []

    # A) single 80/20 split that keeps each group on one side
    holdout = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for fit_rows, eval_rows in holdout.split(X, y, groups):
        forest.fit(X.iloc[fit_rows], y[fit_rows])
        predicted = forest.predict(X.iloc[eval_rows])
        holdout_acc = accuracy_score(y[eval_rows], predicted)
        results.append(["Baseline", "80/20 GroupSplit", float(holdout_acc), 0.0])

    # B) grouped k-fold cross-validation
    fold_scores = cross_val_score(
        forest, X, y,
        groups=groups,
        cv=GroupKFold(n_splits=n_splits),
        n_jobs=-1,
    )
    results.append([
        "Baseline",
        f"{n_splits}-Fold GroupKFold",
        float(fold_scores.mean()),
        float(fold_scores.std()),
    ])

    # C) out-of-bag accuracy from a fit on all rows
    forest.fit(X, y)
    oob_accuracy = getattr(forest, "oob_score_", np.nan)
    results.append(["Baseline", "OOB (Bootstrap-Ersatz)", float(oob_accuracy), 0.0])

    return results


# ------------------------ Main ------------------------ #
def main(folder_cheat: str,
         folder_no_cheat: str,
         save_path: str = "baseline_validation.xlsx",
         as_features: bool = True):
    """Run the baseline evaluation and write the result table to Excel.

    Args:
        folder_cheat: Folder whose CSV files are labelled 1.
        folder_no_cheat: Folder whose CSV files are labelled 0.
        save_path: Output ``.xlsx`` path.
        as_features: Forwarded to ``build_dataset``.

    Raises:
        RuntimeError: If no data is left after dataset construction.
    """
    started = time.time()
    print("⏳ Lade Daten einmalig…")

    cheat_files = preload_folder(folder_cheat, 1)
    no_cheat_files = preload_folder(folder_no_cheat, 0)
    preloaded = cheat_files + no_cheat_files
    n_total, n_cheat, n_honest = len(preloaded), len(cheat_files), len(no_cheat_files)
    print(f"➡️ Geladen: {n_total} Dateien (Schummeln: {n_cheat}, Nicht: {n_honest})")

    X, y, groups = build_dataset(preloaded, as_features=as_features)
    if X.empty:
        raise RuntimeError("Keine Daten nach Baseline-Vorbereitung. Prüfe Spaltennamen und CSV-Inhalt.")

    header = ["Pipeline", "Validation Method", "Mean Accuracy", "Std Deviation"]
    table = pd.DataFrame(evaluate_three_methods(X, y, groups, n_splits=5), columns=header)
    table.to_excel(save_path, index=False)

    minutes = (time.time() - started) / 60
    print(f"✅ Fertig in {minutes:.1f} min → {save_path}")


if __name__ == "__main__":
    # >>> Adjust the paths below to your machine <<<
    data_root = r"C:\Users\oxije\Dropbox\Dissertation\Experimente\1. Experiment\Originaldaten_split"
    folder_cheat = data_root + r"\Schummeln"
    folder_no_cheat = data_root + r"\Nicht_Schummeln"

    # as_features=True is the recommended mode (one row per file).
    # Set it to False to evaluate on individual rows instead.
    main(
        folder_cheat,
        folder_no_cheat,
        save_path="baseline_validation.xlsx",
        as_features=True,
    )


Comparison of Validation Methods and Accuracy Evaluation

In [None]:
import os
import time
import itertools
import warnings
import numpy as np
import pandas as pd
from dataclasses import dataclass

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    GroupShuffleSplit,
    GroupKFold,
    cross_val_score,
)
from sklearn.metrics import accuracy_score
from scipy.signal import butter, filtfilt

warnings.filterwarnings("ignore")

# =============================================================
#   ZIEL
#   - 3 Validierungsvarianten vergleichen (80/20, K-Fold, "Bootstrap")
#   - Deutlich schneller & sauberer (kein Data Leakage über Zeilen!)
#   - Daten NUR EINMAL laden
#   - OOB-Score als Bootstrap-Ersatz (schnell + statistisch sinnvoll)
# =============================================================

# ------------------------ Utilities ------------------------ #
def to_f32(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` converted to 32-bit floats.

    The conversion is requested with ``copy=False`` so pandas may avoid
    copying when it can.
    """
    return df.astype(copy=False, dtype=np.float32)


def load_csv(path: str) -> pd.DataFrame:
    """Read one CSV file and return its two gaze columns as float32.

    The file is parsed with ``,`` as the decimal mark. Headers are matched
    after stripping whitespace and lower-casing. If either gaze column cannot
    be found that way, the exact names ``"Gaze X"`` and ``"Gaze Y"`` are
    used. Unparseable values become NaN; missing values are NOT removed here.
    """
    table = pd.read_csv(path, decimal=",", low_memory=False)

    # Normalised header -> header as written in the file.
    lookup = {name.strip().lower(): name for name in table.columns}
    gaze_x = lookup.get("gaze x")
    gaze_y = lookup.get("gaze y")

    if gaze_x is None or gaze_y is None:
        # Fall back to the exact column names.
        selected = ["Gaze X", "Gaze Y"]
    else:
        selected = [gaze_x, gaze_y]

    numeric = table[selected].apply(pd.to_numeric, errors="coerce")
    return to_f32(numeric)


def preload_folder(folder: str, target: int):
    """Load every ``.csv`` file of ``folder`` once and tag it with ``target``.

    Args:
        folder: Directory to scan (not recursive).
        target: Class label attached to every file of this folder.

    Returns:
        A list of ``(file_name, dataframe, target)`` tuples, ordered by file
        name. Files that fail to load are reported on stdout and skipped;
        files that load to an empty frame are skipped silently.
    """
    out = []
    # os.listdir() yields entries in arbitrary order; sorting makes the row
    # order of every dataset built from this list reproducible.
    for fn in sorted(os.listdir(folder)):
        if not fn.lower().endswith(".csv"):
            continue
        p = os.path.join(folder, fn)
        try:
            df = load_csv(p)
        except Exception as e:
            # Deliberately best-effort: one broken file must not stop the run.
            print(f"⚠️ Fehler beim Laden {p}: {e}")
            continue
        if not df.empty:
            out.append((fn, df, target))
    return out


# ------------------------ Preprocessing ------------------------ #
def handle_missing_values(df: pd.DataFrame, method: str) -> pd.DataFrame:
    """Treat NaN values in ``df`` according to ``method``.

    Args:
        df: Input frame; it is not modified.
        method: ``"mean"`` fills each column with its mean, ``"locf"``
            carries the last valid observation forward, ``"delete"`` drops
            every row containing a NaN. Any other value returns ``df`` as is.

    Returns:
        A new frame for the three known methods, otherwise ``df`` itself.
        With ``"locf"`` NaNs before the first valid value of a column remain,
        because there is nothing to carry forward.
    """
    if method == "mean":
        return df.fillna(df.mean(numeric_only=True))
    if method == "locf":
        # DataFrame.ffill() replaces fillna(method="ffill"), which is
        # deprecated in pandas 2.1+ and unavailable in newer releases.
        return df.ffill()
    if method == "delete":
        return df.dropna()
    return df


def handle_outliers(df: pd.DataFrame, method: str, z_thresh: float = 3.0) -> pd.DataFrame:
    """Treat values whose absolute z-score exceeds ``z_thresh``.

    The z-score is computed per column from the column mean and standard
    deviation of ``df``; a standard deviation of 0 is replaced by NaN so that
    constant columns never flag anything.

    Args:
        df: Input frame; it is not modified.
        method: ``"mean"`` overwrites flagged cells with the column mean,
            ``"locf"`` blanks flagged cells and forward-fills the column,
            ``"delete"`` drops every row with at least one flagged cell.
            Any other value returns ``df`` unchanged.
        z_thresh: Absolute z-score above which a cell counts as an outlier.
    """
    col_mean = df.mean(numeric_only=True)
    col_std = df.std(numeric_only=True).replace(0, np.nan)
    is_outlier = ((df - col_mean) / col_std).abs() > z_thresh

    if method == "delete":
        rows_to_keep = ~is_outlier.any(axis=1)
        return df[rows_to_keep]

    if method == "mean":
        cleaned = df.copy()
        for name in df.columns:
            flagged = is_outlier[name]
            if flagged.any():
                cleaned.loc[flagged, name] = col_mean[name]
        return cleaned

    if method == "locf":
        cleaned = df.copy()
        for name in df.columns:
            # mask() turns flagged cells into NaN, ffill() then carries the
            # previous value forward (pre-existing NaNs are filled as well).
            cleaned[name] = cleaned[name].mask(is_outlier[name]).ffill()
        return cleaned

    return df


def apply_feature_limits(df: pd.DataFrame, limits: bool = True) -> pd.DataFrame:
    """Keep only rows whose first two columns lie inside fixed bounds.

    With ``limits`` true, a row is kept when column 0 is within [0, 1920]
    and column 1 is within [0, 1080] (both ends inclusive). With ``limits``
    false, ``df`` is returned unchanged.
    """
    if limits:
        x_in_range = df.iloc[:, 0].between(0, 1920)
        y_in_range = df.iloc[:, 1].between(0, 1080)
        return df[x_in_range & y_in_range]
    return df


def smooth_moving_average(df: pd.DataFrame, on: bool, window: int = 5) -> pd.DataFrame:
    """Smooth every column with a centred moving average.

    Args:
        df: Input frame; it is not modified.
        on: Smoothing is skipped when false.
        window: Number of samples averaged; values ``<= 1`` disable smoothing.

    Returns:
        ``df`` itself when smoothing is skipped or ``df`` has no rows,
        otherwise a copy with the same number of rows.

    Note:
        Samples outside the signal count as zero, so the first and last
        ``window // 2`` values are pulled towards zero.
    """
    n_rows = len(df)
    # np.convolve() rejects empty input, so an empty frame is passed through.
    if not on or window <= 1 or n_rows == 0:
        return df

    kernel = np.ones(window, dtype=np.float32) / window
    # mode="same" returns max(len(signal), window) samples, which no longer
    # fits the frame when it is shorter than the window. Taking the centred
    # n_rows samples of the full convolution gives the same values for long
    # signals and the right length for short ones.
    start = (window - 1) // 2
    out = df.copy()
    for c in df.columns:
        full = np.convolve(df[c].values, kernel, mode="full")
        out[c] = full[start:start + n_rows]
    return out


def low_pass_filter(df: pd.DataFrame, on: bool, cutoff: float = 0.1, fs: float = 30.0) -> pd.DataFrame:
    """Apply a zero-phase first-order Butterworth low-pass to every column.

    Args:
        df: Input frame; it is not modified.
        on: Filtering is skipped when false.
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        fs: Sampling frequency used to normalise ``cutoff``.

    Returns:
        ``df`` itself when filtering is skipped or ``df`` has no rows,
        otherwise a new float32 frame with the same columns and a fresh
        0..n-1 index.
    """
    n_rows = len(df)
    if not on or n_rows == 0:
        return df

    nyq = 0.5 * fs
    # Keep the normalised cut-off strictly inside (0, 1) as butter() requires.
    normal_cutoff = max(min(cutoff / nyq, 0.99), 1e-6)
    b, a = butter(1, normal_cutoff, btype="low", analog=False)

    # filtfilt() pads the signal by padlen samples on each side and raises
    # unless the signal is longer than that. Its default is
    # 3 * max(len(a), len(b)); clamp it so short recordings are still
    # filtered instead of aborting the run.
    padlen = min(3 * max(len(a), len(b)), n_rows - 1)
    arr = filtfilt(b, a, df.values, axis=0, padlen=padlen)
    return pd.DataFrame(arr, columns=df.columns, dtype=np.float32)


def normalize(df: pd.DataFrame, method: str) -> pd.DataFrame:
    """Scale every column of ``df`` with a scaler fitted on ``df`` itself.

    Args:
        df: Input frame; it is not modified.
        method: ``"minmax"`` (MinMaxScaler), ``"robust"`` (RobustScaler) or
            ``"zscore"`` (StandardScaler). Any other value returns ``df``
            unchanged.

    Returns:
        ``df`` itself for an empty frame or an unknown method, otherwise a
        new float32 frame with the same columns and index.
    """
    # NOTE(review): preprocess_one calls this once per file and then derives
    # mean/std features from the result; with "zscore" those features look
    # constant across files. Confirm that per-file scaling is intended.

    # The scalers cannot be fitted on zero rows.
    if df.empty:
        return df

    if method == "minmax":
        scaler = MinMaxScaler()
    elif method == "robust":
        scaler = RobustScaler()
    elif method == "zscore":
        scaler = StandardScaler()
    else:
        return df

    # Build a new frame instead of assigning into ``df`` so the caller's
    # data is never overwritten.
    scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)
    return to_f32(scaled)


# Eine Datei -> preprocess -> (Option A) Zeilen behalten  |  (Option B) Feature-Vektor pro Datei

def preprocess_one(df: pd.DataFrame,
                   missing_method, outlier_method, normalize_method,
                   feature_limits, smoothing_on, lpf_on,
                   as_features: bool):
    """Run the preprocessing chain on the data of one file.

    Order of steps: missing values, outliers, coordinate limits, moving
    average, low-pass filter, normalisation.

    Args:
        df: Gaze samples of one file (two columns).
        missing_method: Forwarded to ``handle_missing_values``; skipped if falsy.
        outlier_method: Forwarded to ``handle_outliers``; skipped if falsy.
        normalize_method: Forwarded to ``normalize``; skipped if falsy.
        feature_limits: Forwarded to ``apply_feature_limits``; skipped if ``None``.
        smoothing_on: Enables the moving average.
        lpf_on: Enables the low-pass filter.
        as_features: If true, return one row of summary statistics,
            otherwise return the processed samples with a fresh index.

    Returns:
        A one-row feature frame, the processed samples, or an empty
        ``DataFrame`` when the cleaning steps removed every row.
    """
    d = df
    # Steps that can remove rows.
    d = handle_missing_values(d, missing_method) if missing_method else d
    d = handle_outliers(d, outlier_method) if outlier_method else d
    d = apply_feature_limits(d, feature_limits) if feature_limits is not None else d

    # Nothing left: hand back an empty frame so the caller can skip the file.
    # The filtering and scaling steps below cannot process zero rows, and in
    # feature mode an empty input would otherwise become a row of NaNs.
    if d.empty:
        return pd.DataFrame()

    d = smooth_moving_average(d, smoothing_on)
    d = low_pass_filter(d, lpf_on)
    d = normalize(d, normalize_method) if normalize_method else d

    if as_features:
        # Minimal feature set (very fast); can be extended later
        # (fixations etc.).
        gx, gy = d.iloc[:, 0], d.iloc[:, 1]
        feats = {
            "gx_mean": gx.mean(),
            "gx_std": gx.std(ddof=1),
            "gy_mean": gy.mean(),
            "gy_std": gy.std(ddof=1),
            "gx_iqr": (gx.quantile(0.75) - gx.quantile(0.25)),
            "gy_iqr": (gy.quantile(0.75) - gy.quantile(0.25)),
        }
        return pd.DataFrame([feats])
    return d.reset_index(drop=True)


def build_dataset(preloaded_files,
                  missing_method, outlier_method, normalize_method,
                  feature_limits, smoothing_on, lpf_on,
                  as_features: bool):
    """Preprocess every preloaded file and stack the results.

    Args:
        preloaded_files: Iterable of ``(file_name, dataframe, target)``.
        missing_method, outlier_method, normalize_method, feature_limits,
        smoothing_on, lpf_on, as_features: Forwarded to ``preprocess_one``.

    Returns:
        ``(X, y, groups)`` with float32 ``X``, int8 ``y`` and one file name
        per row in ``groups``. Files whose processed frame is empty are left
        out; if nothing remains the result is
        ``(empty DataFrame, empty array, [])``.
    """
    options = (missing_method, outlier_method, normalize_method,
               feature_limits, smoothing_on, lpf_on, as_features)
    frames, labels, group_ids = [], [], []

    for file_name, raw, label in preloaded_files:
        processed = preprocess_one(raw, *options)
        if processed.empty:
            continue
        n_rows = len(processed)
        frames.append(processed)
        labels.append(np.full(n_rows, label, dtype=np.int8))
        # The file name is the group id, so group-aware splitters never put
        # rows of one file on both sides of a split.
        group_ids.extend([file_name] * n_rows)

    if len(frames) == 0:
        return pd.DataFrame(), np.array([]), []

    X = pd.concat(frames, ignore_index=True)
    y = np.concatenate(labels).astype(np.int8)
    return to_f32(X), y, np.array(group_ids)


# ------------------------ Evaluation ------------------------ #
@dataclass
class EvalResult:
    """Accuracy summary produced by one validation method."""
    # Preprocessing combination the result belongs to; eval_all in this file
    # passes an empty string here and main pairs results with the combo itself.
    combo: tuple
    # Label of the validation method, e.g. "80/20 GroupSplit".
    method: str
    # Accuracy (mean over folds for cross-validation).
    mean_acc: float
    # Standard deviation over folds; 0.0 for single-run methods.
    std_acc: float


def eval_all(X, y, groups, n_splits=5):
    """Score one random-forest configuration with three validation schemes.

    Args:
        X: Feature matrix (DataFrame).
        y: Labels aligned with the rows of ``X``.
        groups: Group id per row; used by the group-aware splitters.
        n_splits: Number of folds for the GroupKFold run.

    Returns:
        A list of ``EvalResult`` objects (with an empty ``combo``) in the
        order 80/20 GroupShuffleSplit, GroupKFold, out-of-bag.
    """
    # Parallel forest with OOB scoring enabled as the "bootstrap" substitute.
    # The same estimator object is refit for the hold-out and the OOB run.
    forest = RandomForestClassifier(
        bootstrap=True,
        oob_score=True,
        n_estimators=200,
        max_features="log2",
        min_samples_leaf=2,
        min_samples_split=2,
        random_state=42,
        n_jobs=-1,
    )
    outcomes = []

    # A) single 80/20 split that keeps each group on one side
    holdout = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for fit_rows, eval_rows in holdout.split(X, y, groups):
        forest.fit(X.iloc[fit_rows], y[fit_rows])
        predicted = forest.predict(X.iloc[eval_rows])
        holdout_acc = accuracy_score(y[eval_rows], predicted)
        outcomes.append(EvalResult("", "80/20 GroupSplit", float(holdout_acc), 0.0))

    # B) grouped k-fold cross-validation; cross_val_score clones the
    # estimator internally and n_jobs=-1 runs the folds in parallel.
    fold_scores = cross_val_score(
        forest, X, y,
        groups=groups,
        cv=GroupKFold(n_splits=n_splits),
        n_jobs=-1,
    )
    outcomes.append(EvalResult(
        "",
        f"{n_splits}-Fold GroupKFold",
        float(fold_scores.mean()),
        float(fold_scores.std()),
    ))

    # C) "bootstrapping": refit on ALL rows and use the out-of-bag samples
    # as the test set.
    forest.fit(X, y)
    oob_accuracy = getattr(forest, "oob_score_", np.nan)
    outcomes.append(EvalResult("", "OOB (Bootstrap-Ersatz)", float(oob_accuracy), 0.0))

    return outcomes


# ------------------------ Hauptprogramm ------------------------ #
def main(folder_cheat: str,
         folder_no_cheat: str,
         save_path: str = "validation_comparison_optimized.xlsx",
         as_features: bool = True,  # True = one row per file; False = row-based
         save_every: int = 20):
    """Evaluate every preprocessing combination and write the results to Excel.

    The files of both folders are loaded once. For each combination of the
    preprocessing grid the dataset is rebuilt and scored with ``eval_all``;
    combinations that yield an empty or NaN-containing dataset are recorded
    with empty scores.

    Args:
        folder_cheat: Folder whose CSV files are labelled 1.
        folder_no_cheat: Folder whose CSV files are labelled 0.
        save_path: Output ``.xlsx`` path. Intermediate results go to the same
            path with ``.xlsx`` replaced by ``_partial.xlsx``.
        as_features: Forwarded to ``build_dataset``.
        save_every: Write an intermediate file after this many combinations.
    """
    started = time.time()
    print("⏳ Lade Daten einmalig…")
    cheat_files = preload_folder(folder_cheat, 1)
    no_cheat_files = preload_folder(folder_no_cheat, 0)
    preloaded = cheat_files + no_cheat_files
    print(f"➡️ Geladen: {len(preloaded)} Dateien (Schummeln: {len(cheat_files)}, Nicht: {len(no_cheat_files)})")

    # Grid axes, in the positional order build_dataset expects them:
    # missing, outlier, normalisation, limits, smoothing, low-pass.
    grid = list(itertools.product(
        ["mean", "locf", "delete"],
        ["mean", "locf", "delete"],
        ["minmax", "robust", "zscore"],
        [True, False],
        [True, False],
        [True, False],
    ))
    total = len(grid)

    header = ["Preprocessing Combination", "Validation Method", "Mean Accuracy", "Std Deviation"]
    skipped_labels = ("80/20 GroupSplit", "5-Fold GroupKFold", "OOB (Bootstrap-Ersatz)")
    records = []

    for done, combo in enumerate(grid, start=1):
        X, y, groups = build_dataset(preloaded, *combo, as_features=as_features)

        unusable = X.empty or np.isnan(X.values).any()
        if unusable:
            print(f"⚠️ Skip {combo} (leer/NaN)")
            records.extend([combo, label, None, None] for label in skipped_labels)
        else:
            records.extend(
                [combo, outcome.method, outcome.mean_acc, outcome.std_acc]
                for outcome in eval_all(X, y, groups, n_splits=5)
            )

        if done % save_every == 0:
            checkpoint = save_path.replace(".xlsx", "_partial.xlsx")
            pd.DataFrame(records, columns=header).to_excel(checkpoint, index=False)
            print(f"💾 Zwischenspeicher nach {done}/{total} Kombos → {checkpoint}")

    pd.DataFrame(records, columns=header).to_excel(save_path, index=False)
    minutes = (time.time() - started) / 60
    print(f"✅ Fertig in {minutes:.1f} min → {save_path}")


if __name__ == "__main__":
    # >>> Adjust the paths below to your machine <<<
    # Raw strings keep backslashes literally, so each path separator is
    # written once (a doubled backslash would end up doubled in the path).
    folder_cheat = r"C:\Users\oxije\Dropbox\Dissertation\Experimente\1. Experiment\Originaldaten_split\Schummeln"
    folder_no_cheat = r"C:\Users\oxije\Dropbox\Dissertation\Experimente\1. Experiment\Originaldaten_split\Nicht_Schummeln"

    main(folder_cheat, folder_no_cheat,
         save_path="validation_comparison_optimized.xlsx",
         as_features=True,   # True is recommended (one row per file)
         save_every=20)
