In [1]:
# Path to the .npz archive analysed by this notebook.
# Set the FUSED_NPZ_PATH environment variable to point at a copy elsewhere
# without editing the notebook; when it is unset the original local path is used.
import os

PATH = os.environ.get(
    "FUSED_NPZ_PATH",
    '/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/HC_h_fusion/hc_fused_readtext.npz',
)

In [2]:
import re, numpy as np

# Read the three arrays out of the archive. Using the archive as a context
# manager closes its file handle as soon as the arrays are in memory.
with np.load(PATH, allow_pickle=False) as data:
    X, y, ids = data["X"], data["y"], data["ids"]

# Every sample needs exactly one feature row, one label and one id.
assert len(X) == len(y) == len(ids), "X, y and ids must have the same number of samples"

# Column layout of the fused feature matrix: [text | audio | clip].
D_TEXT, D_AUDIO, D_CLIP = 768, 768, 512
# A narrower X would make the slices below silently return fewer columns.
assert X.shape[1] >= D_TEXT + D_AUDIO + D_CLIP, (
    f"X has {X.shape[1]} columns, expected at least {D_TEXT + D_AUDIO + D_CLIP}"
)
sl_text  = slice(0, D_TEXT)
sl_audio = slice(D_TEXT, D_TEXT + D_AUDIO)
sl_clip  = slice(D_TEXT + D_AUDIO, D_TEXT + D_AUDIO + D_CLIP)

# patient grouping: split by patient to avoid speaker leakage
_PATIENT_PREFIX = re.compile(r"^ID\d+")


def _patient_id(sample_id):
    """Return the leading ``ID<digits>`` of a sample id, else the text before the first ``_``."""
    found = _PATIENT_PREFIX.match(sample_id)
    return found.group(0) if found else sample_id.split("_")[0]


patients = np.array([_patient_id(i) for i in ids])


In [3]:
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score

def eval_cv(Xsub, y, groups, clf, n_splits=5, seed=42):
    """Score ``clf`` with shuffled ``StratifiedGroupKFold`` cross-validation.

    Parameters
    ----------
    Xsub : array indexed by row with the fold index arrays (feature matrix).
    y : array of labels, indexed the same way as ``Xsub``.
    groups : group label per sample, passed to the splitter's ``split``.
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``; the very same
        object is fitted again on every fold.
    n_splits : number of folds requested from the splitter.
    seed : ``random_state`` handed to the splitter.

    Returns
    -------
    tuple of four floats
        ``(mean BA, std BA, mean macro-F1, std macro-F1)`` over the folds,
        where BA is ``balanced_accuracy_score``.
    """
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_ba = []
    fold_f1 = []
    for train_idx, test_idx in splitter.split(Xsub, y, groups=groups):
        y_true = y[test_idx]
        clf.fit(Xsub[train_idx], y[train_idx])
        y_pred = clf.predict(Xsub[test_idx])
        fold_ba.append(balanced_accuracy_score(y_true, y_pred))
        fold_f1.append(f1_score(y_true, y_pred, average="macro"))
    # Order matters: BA mean/std first, then F1 mean/std.
    return (
        float(np.mean(fold_ba)),
        float(np.std(fold_ba)),
        float(np.mean(fold_f1)),
        float(np.std(fold_f1)),
    )


In [4]:
# Classifier 1: features are standardised, then passed to an RBF-kernel SVC
# with balanced class weights.
svm = make_pipeline(
    StandardScaler(with_std=True, with_mean=True),
    SVC(C=1.0, kernel="rbf", gamma="scale", class_weight="balanced"),
)

# Classifier 2: 400-tree random forest, unlimited depth, balanced class
# weights, fixed seed.
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    class_weight="balanced",
    random_state=42,
)


In [5]:
def make_view(X, slices):
    """Build a feature matrix from selected column ranges of ``X``.

    Each entry of ``slices`` is used as a column index (``X[:, s]``); the
    resulting pieces are joined along axis 1 in the order given.
    """
    column_blocks = []
    for col_sel in slices:
        column_blocks.append(X[:, col_sel])
    return np.concatenate(column_blocks, axis=1)

# Feature subsets to compare. Each label maps to the column slices that
# make_view() joins into one feature matrix; dict order is the print order.
ablations = dict([
    ("text_only",  [sl_text]),
    ("audio_only", [sl_audio]),
    ("clip_only",  [sl_clip]),
    ("text+audio", [sl_text, sl_audio]),
    ("text+clip",  [sl_text, sl_clip]),
    ("audio+clip", [sl_audio, sl_clip]),
    ("all_three",  [sl_text, sl_audio, sl_clip]),
])

# Same evaluation for both classifiers: a banner line, then one row per subset
# with mean±sd of balanced accuracy and macro-F1 from eval_cv().
for banner, model in (("=== SVM (RBF) ===", svm), ("\n=== RandomForest ===", rf)):
    print(banner)
    for name, sls in ablations.items():
        Xsub = make_view(X, sls)
        ba_m, ba_s, f1_m, f1_s = eval_cv(Xsub, y, patients, model)
        print(f"{name:12s} | BA {ba_m:.3f}±{ba_s:.3f} | F1 {f1_m:.3f}±{f1_s:.3f}")


=== SVM (RBF) ===
text_only    | BA 1.000±0.000 | F1 1.000±0.000
audio_only   | BA 1.000±0.000 | F1 1.000±0.000
clip_only    | BA 1.000±0.000 | F1 1.000±0.000
text+audio   | BA 1.000±0.000 | F1 1.000±0.000
text+clip    | BA 1.000±0.000 | F1 1.000±0.000
audio+clip   | BA 1.000±0.000 | F1 1.000±0.000
all_three    | BA 1.000±0.000 | F1 1.000±0.000

=== RandomForest ===
text_only    | BA 1.000±0.000 | F1 1.000±0.000
audio_only   | BA 1.000±0.000 | F1 1.000±0.000
clip_only    | BA 1.000±0.000 | F1 1.000±0.000
text+audio   | BA 1.000±0.000 | F1 1.000±0.000
text+clip    | BA 1.000±0.000 | F1 1.000±0.000
audio+clip   | BA 1.000±0.000 | F1 1.000±0.000
all_three    | BA 1.000±0.000 | F1 1.000±0.000


In [6]:
# Label-permutation sanity check on the full feature set: shuffle the labels,
# re-run the grouped CV with the SVM, and collect the mean balanced accuracy.
# NOTE(review): labels are shuffled per sample, not per patient, so samples of
# one patient can receive different labels -- confirm this is the intended null.
N_PERMUTATIONS = 50
rng = np.random.default_rng(123)

# The feature matrix does not depend on the permutation, so build it once.
Xsub = make_view(X, [sl_text, sl_audio, sl_clip])  # all features

perm_bas = []
for i in range(N_PERMUTATIONS):
    y_perm = rng.permutation(y)
    # Index the result instead of unpacking into `_`, which would shadow
    # IPython's last-output variable, and keep `ba_m` from earlier cells intact.
    perm_ba = eval_cv(Xsub, y_perm, patients, svm, n_splits=5, seed=42 + i)[0]
    perm_bas.append(perm_ba)

print(f"Permutation BA mean±sd over {N_PERMUTATIONS} runs:", np.mean(perm_bas), "±", np.std(perm_bas))




Permutation BA mean±sd over 50 runs: 0.5044809523809524 ± 0.11888945388426155


In [7]:
# A) Exact duplicates across samples?
# Each feature row is hashed from its raw bytes, so only byte-identical rows
# count as duplicates. Occurrences are tallied once with a Counter, which keeps
# the check linear in the number of samples (list.count per row is quadratic).
import hashlib
from collections import Counter

hashes = [hashlib.md5(row.tobytes()).hexdigest() for row in X]
hash_counts = Counter(hashes)
dups = [ids[i] for i, h in enumerate(hashes) if hash_counts[h] > 1]
print("Duplicate feature vectors:", sorted(set(dups))[:10])

# B) Do any patients appear in both train and test folds in a manual split?
# (If you are not using GroupKFold elsewhere, switch to it.)


Duplicate feature vectors: []


In [8]:
import re, numpy as np
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, f1_score

# X, y, ids come from the .npz archive loaded earlier in the notebook
# patient id: "ID00_hc_0_0_0" -> "ID00"
patients = np.array([re.match(r"^ID\d+", i).group(0) if re.match(r"^ID\d+", i) else i.split("_")[0] for i in ids])

# one label per patient (majority vote)
pat2ys = {}
for pid, yy in zip(patients, y):
    pat2ys.setdefault(pid, []).append(int(yy))
patients_unique = np.array(sorted(pat2ys.keys()))
patient_labels = np.array([Counter(pat2ys[pid]).most_common(1)[0][0] for pid in patients_unique])

# choose K so every fold can contain both classes
# Count only the classes that actually occur. np.bincount would report 0 for
# any unused integer between 0 and the largest label (forcing K down to 2) and
# raises on negative labels.
_, class_counts = np.unique(patient_labels, return_counts=True)
min_per_class = int(class_counts.min())
K = min(5, max(2, min_per_class))   # e.g., 3 if classes are small
skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

# Standardise features, then RBF-kernel SVC with balanced class weights.
svm = make_pipeline(StandardScaler(with_mean=True, with_std=True),
                    SVC(kernel="rbf", C=1.0, gamma="scale", class_weight="balanced"))

def eval_patient_folds(X, y, patients, patients_unique, patient_labels, clf, cv=None):
    """Cross-validate ``clf`` on folds that are drawn over patients, not samples.

    The splitter partitions ``patients_unique`` (stratified on
    ``patient_labels``); every sample then follows its patient into the train
    or test side of the fold.

    Parameters
    ----------
    X : feature matrix, one row per sample.
    y : label per sample.
    patients : patient id per sample; every id must occur in ``patients_unique``.
    patients_unique : array of distinct patient ids.
    patient_labels : one label per entry of ``patients_unique``.
    clf : estimator with ``fit`` / ``predict``; refitted on every fold.
    cv : optional splitter exposing ``split(patients_unique, patient_labels)``.
        When omitted, the module-level ``skf`` is used, as before.

    Returns
    -------
    tuple of four floats
        ``(mean BA, std BA, mean macro-F1, std macro-F1)`` over the folds that
        were scored. All four are NaN when no fold could be scored.

    Notes
    -----
    Folds whose test samples contain fewer than two classes are skipped; a
    ``RuntimeWarning`` reports how many were dropped.
    """
    import warnings

    if cv is None:
        cv = skf  # default to the notebook-level splitter for existing callers

    ba, f1 = [], []
    # map sample->patient index
    pid_to_idx = {pid: i for i, pid in enumerate(patients_unique)}
    sample_pat_idx = np.array([pid_to_idx[p] for p in patients])

    n_skipped = 0
    for tr_pat, te_pat in cv.split(patients_unique, patient_labels):
        tr_mask = np.isin(sample_pat_idx, tr_pat)
        te_mask = np.isin(sample_pat_idx, te_pat)
        # sanity: both classes present in test
        if len(np.unique(y[te_mask])) < 2:
            n_skipped += 1
            continue
        clf.fit(X[tr_mask], y[tr_mask])
        yp = clf.predict(X[te_mask])
        ba.append(balanced_accuracy_score(y[te_mask], yp))
        f1.append(f1_score(y[te_mask], yp, average="macro"))

    if n_skipped:
        warnings.warn(
            f"{n_skipped} fold(s) skipped: test split contained fewer than two classes",
            RuntimeWarning,
            stacklevel=2,
        )
    if not ba:
        # Nothing was scored; return NaNs explicitly instead of averaging an empty list.
        nan = float("nan")
        return nan, nan, nan, nan
    return float(np.mean(ba)), float(np.std(ba)), float(np.mean(f1)), float(np.std(f1))

# Patient-level CV on the true labels; prints mean±sd over the scored folds.
ba_m, ba_s, f1_m, f1_s = eval_patient_folds(
    X, y, patients, patients_unique, patient_labels, clf=svm
)
print(
    f"SVM (patient-stratified {K}-fold) "
    f"| BA {ba_m:.3f}±{ba_s:.3f} "
    f"| F1 {f1_m:.3f}±{f1_s:.3f}"
)


SVM (patient-stratified 5-fold) | BA 1.000±0.000 | F1 1.000±0.000


In [9]:
# Label-permutation check for the patient-level folds: shuffle the sample
# labels, re-run eval_patient_folds with the SVM, keep the mean balanced accuracy.
# NOTE(review): folds are still stratified on the unpermuted patient_labels and
# labels are shuffled per sample rather than per patient -- confirm intended.
N_PERMUTATIONS = 50
rng = np.random.default_rng(123)
perm = []
for _rep in range(N_PERMUTATIONS):
    y_perm = rng.permutation(y)
    # Index the result rather than unpacking into `_` (IPython's last-output
    # variable) and `ba_m` (the true-label score from the previous cell).
    perm_ba = eval_patient_folds(X, y_perm, patients, patients_unique, patient_labels, svm)[0]
    perm.append(perm_ba)
print("Permutation (patient-folds) BA mean±sd:", np.mean(perm), "±", np.std(perm))


Permutation (patient-folds) BA mean±sd: 0.5207821428571429 ± 0.12440164726677487
