In [2]:
# xgb_rowlevel_push74.py
# Row-level XGBoost, fitur makin kaya:
# - rolling variability (x,y,step) + circular stats (cos/sin Δangle, MRL, circular variance)
# - entropy Δangle & step
# - histogram fitur (Δangle 8 bin, step kuantil global 8–9 bin) -> nilai & proporsi
# - rasio small/large saccade, rate langkah besar
# - rolling quantiles & straightness, bbox
# - auto-scaler selection (none/standard/robust/quantile)
# NOTE: kompatibel XGBoost lama (tanpa early stopping / callbacks)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from xgboost import XGBClassifier

# =========================
# CONFIG
# =========================
CSV_PATH = "truncated_dataset-seqglo.csv"  # ganti kalau perlu
ROLL_W = 21                # coba 15/21/25
USE_SUBSAMPLE = True       # False = full training (lebih lama, bisa >1–2% naik)
SUBSAMPLE_N = 140_000      # total train rows (imbang). Naikin jika kuat.
RANDOM_STATE = 42

SCALE_MODES = ["none", "standard", "robust", "quantile"]

XGB_KW = dict(
    objective="binary:logistic",
    n_estimators=750,      # 650–900 (kalau dinaikkan, turunkan lr)
    learning_rate=0.055,   # 0.05–0.07
    max_depth=5,
    min_child_weight=8,
    gamma=0.25,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.6,
    reg_lambda=2.2,
    tree_method="hist",
    random_state=RANDOM_STATE,
    n_jobs=4,
)

# =========================
# Utils
# =========================
def entropy_hist(a, bins_edges):
    a = np.asarray(a)
    a = a[np.isfinite(a)]
    if a.size == 0:
        return 0.0
    hist, _ = np.histogram(a, bins=bins_edges)
    s = hist.sum()
    if s == 0:
        return 0.0
    p = hist.astype(float) / s
    p = p[p > 0]
    return float(-(p * np.log(p)).sum())

def hist_feats(a, bins_edges, prefix):
    """Return dict: counts, proportions for each bin (except last edge), plus sum."""
    hist, _ = np.histogram(a, bins=bins_edges)
    total = hist.sum() + 1e-9
    d = {}
    for i, c in enumerate(hist):
        d[f"{prefix}_bin{i}"] = float(c)
        d[f"{prefix}_p{i}"] = float(c) / total
    d[f"{prefix}_sum"] = float(total)
    return d

# =========================
# 1) Load & pre-bins (global)
# =========================
df = pd.read_csv(CSV_PATH).sort_values(["nama", "time"]).reset_index(drop=True)

# global deltas untuk bikin step bins stabil
dx_all = df["gazeX"].diff()
dy_all = df["gazeY"].diff()
mask_new_subject = df["nama"].ne(df["nama"].shift(1))
dx_all = dx_all.mask(mask_new_subject, 0.0)
dy_all = dy_all.mask(mask_new_subject, 0.0)
step_all = np.sqrt(dx_all**2 + dy_all**2).fillna(0.0).values

# kuantil global untuk step bins (8–9 bin)
q = np.quantile(step_all, [0.00, 0.10, 0.25, 0.50, 0.75, 0.90, 0.97, 0.995, 1.00])
step_bins = np.unique(q)
if step_bins.size < 6:
    step_bins = np.linspace(float(step_all.min()), float(step_all.max()+1e-6), 9)

# delta-angle bins (8 bin rata di [-pi, pi])
dang_bins = np.linspace(-np.pi, np.pi, 9)

# threshold small/large saccade (pakai kuantil global)
small_thr = np.quantile(step_all, 0.25)  # saccade kecil
large_thr = np.quantile(step_all, 0.90)  # saccade besar

# =========================
# 2) Feature Engineering
# =========================
def add_roll_feats(g, w=21):
    g = g.copy()

    dx = g["gazeX"].diff()
    dy = g["gazeY"].diff()
    dx.iloc[0] = 0.0; dy.iloc[0] = 0.0
    step = np.sqrt(dx**2 + dy**2)

    ang = np.arctan2(dy, dx)
    d_ang = np.diff(ang, prepend=ang.iloc[0])
    d_ang = (d_ang + np.pi) % (2*np.pi) - np.pi

    # circular stats (rolling)
    cos_da = np.cos(d_ang)
    sin_da = np.sin(d_ang)
    r_mean_cos = pd.Series(cos_da, index=g.index).rolling(w, min_periods=1).mean()
    r_mean_sin = pd.Series(sin_da, index=g.index).rolling(w, min_periods=1).mean()
    r_mrl = np.sqrt(r_mean_cos**2 + r_mean_sin**2)
    r_circ_var = 1.0 - r_mrl

    # rolling basic stats
    g["r_std_x"] = g["gazeX"].rolling(w, min_periods=1).std()
    g["r_std_y"] = g["gazeY"].rolling(w, min_periods=1).std()
    g["r_mean_step"] = step.rolling(w, min_periods=1).mean()
    g["r_std_step"]  = step.rolling(w, min_periods=1).std()
    g["r_mean_abs_dang"] = pd.Series(np.abs(d_ang), index=g.index).rolling(w, min_periods=1).mean()
    g["r_std_dang"] = pd.Series(d_ang, index=g.index).rolling(w, min_periods=1).std()
    try:
        g["r_skew_step"] = pd.Series(step, index=g.index).rolling(w, min_periods=1).skew()
        g["r_kurt_step"] = pd.Series(step, index=g.index).rolling(w, min_periods=1).kurt()
    except Exception:
        g["r_skew_step"] = 0.0
        g["r_kurt_step"] = 0.0

    # rolling quantiles
    g["r_q25_step"] = pd.Series(step, index=g.index).rolling(w, min_periods=1).quantile(0.25)
    g["r_q75_step"] = pd.Series(step, index=g.index).rolling(w, min_periods=1).quantile(0.75)

    # straightness & bbox
    disp_x = g["gazeX"] - g["gazeX"].shift(w-1)
    disp_y = g["gazeY"] - g["gazeY"].shift(w-1)
    disp = np.sqrt(disp_x**2 + disp_y**2)
    path_w = pd.Series(step, index=g.index).rolling(w, min_periods=1).sum() + 1e-6
    g["r_straight_ratio"] = (disp / path_w).fillna(0)
    g["bbox_w"] = g["gazeX"].rolling(w, min_periods=1).max() - g["gazeX"].rolling(w, min_periods=1).min()
    g["bbox_h"] = g["gazeY"].rolling(w, min_periods=1).max() - g["gazeY"].rolling(w, min_periods=1).min()

    # entropy (rolling)
    g["r_entropy_dang"] = pd.Series(d_ang, index=g.index).rolling(w, min_periods=1).apply(
        lambda a: entropy_hist(a, dang_bins), raw=True
    )
    g["r_entropy_step"] = pd.Series(step, index=g.index).rolling(w, min_periods=1).apply(
        lambda a: entropy_hist(a, step_bins), raw=True
    )

    # histogram features (rolling) – gunakan pusat window (i) ambil range i-w+1..i
    # untuk efisiensi, pakai rolling apply raw dan kembalikan statistik ringkas:
    def roll_hist_features(series, bins, pref):
        # apply mengembalikan satu angka, jadi kita bikin beberapa kali untuk each bin/prop → mahal.
        # Alternatif: hitung di akhir per-baris (lebih simpel, sedikit lebih lambat tapi oke).
        arr = []
        s = series.values
        n = len(s)
        for i in range(n):
            i0 = max(0, i - w + 1)
            win = s[i0:i+1]
            d = hist_feats(win, bins, pref)
            arr.append(d)
        return pd.DataFrame(arr, index=series.index)

    # Δangle histogram
    dang_hist_df = roll_hist_features(pd.Series(d_ang, index=g.index), dang_bins, "h_dang")
    # step histogram
    step_hist_df = roll_hist_features(pd.Series(step, index=g.index), step_bins, "h_step")

    # gabung ke g
    g = pd.concat([g, dang_hist_df, step_hist_df], axis=1)

    # small/large saccade ratios & rates
    small_mask = (step <= small_thr).astype(float)
    large_mask = (step >= large_thr).astype(float)
    g["r_rate_small"] = pd.Series(small_mask, index=g.index).rolling(w, min_periods=1).mean()
    g["r_rate_large"] = pd.Series(large_mask, index=g.index).rolling(w, min_periods=1).mean()
    g["r_ratio_small_large"] = (g["r_rate_small"] / (g["r_rate_large"] + 1e-6)).replace([np.inf, -np.inf], 0.0)

    # base deltas
    g["dx"] = dx
    g["dy"] = dy
    g["abs_dx"] = dx.abs()
    g["abs_dy"] = dy.abs()
    g["step"] = step

    # circular bundle
    g["r_mean_cos_dang"] = r_mean_cos
    g["r_mean_sin_dang"] = r_mean_sin
    g["r_mrl_dang"] = r_mrl
    g["r_circvar_dang"] = r_circ_var

    return g

# compat include_groups
try:
    df = df.groupby("nama", group_keys=False).apply(add_roll_feats, include_groups=False, w=ROLL_W)
except TypeError:
    df = df.groupby("nama", group_keys=False).apply(add_roll_feats, w=ROLL_W)

# bersihin NaN/Inf
for col in df.columns:
    if col in ["nama", "time", "gazeX", "gazeY", "label"]:
        continue
    med = pd.to_numeric(df[col], errors="coerce").replace([np.inf, -np.inf], np.nan).median()
    df[col] = pd.to_numeric(df[col], errors="coerce").replace([np.inf, -np.inf], np.nan).fillna(med)

# =========================
# 3) Features & Label
# =========================
# pilih semua kolom numerik kecuali yang dilarang
exclude = {"nama", "label"}
feature_cols = [c for c in df.columns if c not in exclude]
X = df[feature_cols].astype(float).copy()
y = (df["label"] == 2).astype(int).values

# =========================
# 4) Split
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# subsample train imbang (opsional)
if USE_SUBSAMPLE:
    rng = np.random.RandomState(RANDOM_STATE)
    ix_pos = np.where(y_train == 1)[0]
    ix_neg = np.where(y_train == 0)[0]
    n_each = min(SUBSAMPLE_N // 2, len(ix_pos), len(ix_neg))
    sel = np.concatenate([
        rng.choice(ix_pos, n_each, replace=False),
        rng.choice(ix_neg, n_each, replace=False),
    ])
    X_train_small = X_train.iloc[sel].reset_index(drop=True)
    y_train_small = y_train[sel]
else:
    X_train_small = X_train.reset_index(drop=True)
    y_train_small = y_train

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_small, y_train_small, test_size=0.15, stratify=y_train_small, random_state=RANDOM_STATE
)

# =========================
# 5) Scaler selection
# =========================
def fit_transform_scaler(mode, Xtr, Xva):
    if mode == "none":
        return Xtr.values, Xva.values, None
    elif mode == "standard":
        sc = StandardScaler()
    elif mode == "robust":
        sc = RobustScaler()
    elif mode == "quantile":
        sc = QuantileTransformer(output_distribution="normal", random_state=RANDOM_STATE, subsample=300_000)
    else:
        raise ValueError("unknown scaler")
    Xtr_s = sc.fit_transform(Xtr)
    Xva_s = sc.transform(Xva)
    return Xtr_s, Xva_s, sc

best = dict(mode=None, thr=0.5, acc=-1.0, auc=None, model=None, scaler=None)
for mode in SCALE_MODES:
    Xtr_s, Xva_s, scaler = fit_transform_scaler(mode, X_tr, X_val)
    clf = XGBClassifier(**XGB_KW)
    clf.fit(Xtr_s, y_tr)

    proba_val = clf.predict_proba(Xva_s)[:, 1]
    # grid rapat sekitar 0.5
    ths = np.arange(0.40, 0.601, 0.0015)
    local_best_acc, local_best_thr = -1.0, 0.5
    for t in ths:
        acc_val = accuracy_score(y_val, (proba_val >= t).astype(int))
        if acc_val > local_best_acc:
            local_best_acc, local_best_thr = acc_val, float(t)

    try:
        auc_val = roc_auc_score(y_val, proba_val)
    except Exception:
        auc_val = None

    if local_best_acc > best["acc"]:
        best.update(dict(mode=mode, thr=local_best_thr, acc=local_best_acc, auc=auc_val, model=clf, scaler=scaler))

print(f"[VAL] Best scaler: {best['mode']} | Acc: {best['acc']:.4f} | AUC: {best['auc'] if best['auc'] is not None else 'nan'} | thr={best['thr']:.3f}")

def apply_scaler(scaler, X):
    return X.values if scaler is None else scaler.transform(X)

# =========================
# 6) Test eval
# =========================
X_test_s = apply_scaler(best["scaler"], X_test)
proba_test = best["model"].predict_proba(X_test_s)[:, 1]
preds_test = (proba_test >= best["thr"]).astype(int)

acc = accuracy_score(y_test, preds_test)
try:
    auc = roc_auc_score(y_test, proba_test)
except Exception:
    auc = float("nan")
cm = confusion_matrix(y_test, preds_test)
report = classification_report(y_test, preds_test, target_names=["Sequential(1)","Random(2)"], digits=4)

print("\n========== TEST ==========")
print(f"Best threshold: {best['thr']:.3f}")
print(f"Accuracy: {acc:.4f}")
print(f"ROC AUC : {auc:.4f}")
print("Confusion matrix [[TN, FP], [FN, TP]]:")
print(cm)
print("\nClassification report:")
print(report)

# (opsional) simpan model
# from pathlib import Path
# best["model"].get_booster().save_model("xgb_rowlevel_push74.json")


[VAL] Best scaler: robust | Acc: 0.7525 | AUC: 0.8318447845804988 | thr=0.498

Best threshold: 0.498
Accuracy: 0.7514
ROC AUC : 0.8337
Confusion matrix [[TN, FP], [FN, TP]]:
[[18606  6426]
 [ 6019 19013]]

Classification report:
               precision    recall  f1-score   support

Sequential(1)     0.7556    0.7433    0.7494     25032
    Random(2)     0.7474    0.7595    0.7534     25032

     accuracy                         0.7514     50064
    macro avg     0.7515    0.7514    0.7514     50064
 weighted avg     0.7515    0.7514    0.7514     50064

