# This is the method I use for fusion now

In [1]:
import os
import re
import numpy as np
import glob

In [8]:
import os, re, numpy as np

def read_filenames_any_encoding(path):
    """Read a text file of filenames, one per line, whatever its encoding.

    The encoding is chosen from the leading byte-order mark (UTF-16 LE/BE or
    UTF-8). Without a BOM the bytes are tried as UTF-8 and fall back to
    Latin-1, which accepts any byte sequence.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: The non-blank lines, stripped of surrounding whitespace.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If BOM-marked data is not valid in that encoding.
    """
    with open(path, "rb") as f:
        data = f.read()

    if data.startswith((b"\xff\xfe", b"\xfe\xff")):
        # The generic "utf-16" codec reads the BOM to pick the byte order and
        # drops it. The explicit "-le"/"-be" codecs keep it as U+FEFF, which
        # str.strip() does not remove, so it would stay glued to the first name.
        text = data.decode("utf-16")
    elif data.startswith(b"\xef\xbb\xbf"):
        text = data.decode("utf-8-sig")
    else:
        try:
            text = data.decode("utf-8")
        except UnicodeDecodeError:
            text = data.decode("latin-1")

    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]


# Fusing all embeddings for ReadText

In [12]:
import os, re, glob, numpy as np

# ---- paths (ReadText only) ----
# Per-cohort (HC / PD) feature bundle (.npz) and the filenames list (.txt) whose
# lines are matched row-for-row against that bundle by
# load_clip_bundle_with_filenames().
HC_READTEXT_NPZ = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/h_graph_selfattn/HC_ReadText_Spectrogram_CLIP_features_h_graph_selfattn.npz"
HC_READTEXT_TXT = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/CLIP_Spectrogram_embeddings/HC_ReadText_Spectrogram_CLIP_filenames.txt"
PD_READTEXT_NPZ = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/h_graph_selfattn/PD_ReadText_Spectrogram_CLIP_features_h_graph_selfattn.npz"
PD_READTEXT_TXT = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/CLIP_Spectrogram_embeddings/PD_ReadText_Spectrogram_CLIP_filenames.txt"

# Folders scanned by map_ids() for one .npz/.npy feature file per sample
# (text and audio modalities, HC and PD cohorts).
HC_TEXT_DIR  = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Transcript_Self_Attention/hc_ReadText_selfattn"
PD_TEXT_DIR  = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Transcript_Self_Attention/PD_ReadText_selfattn"
HC_AUDIO_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/audio_self_attention/HC_ReadText_h_audio_selfattn"
PD_AUDIO_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/audio_self_attention/PD_ReadText_h_audio_selfattn"

# Destination of the fused dataset. NOTE: despite the "HC"/"hc" in the folder and
# file name, the archive written to this path contains both HC and PD rows.
OUT_PATH = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/HC_h_fusion/hc_fused_readtext.npz"
# Create the output folder up front so the final save cannot fail on a missing directory.
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

# Length each modality vector is padded/truncated to (via fix_dim) before
# concatenation; 768 + 768 + 512 = 2048 fused features.
D_TEXT, D_AUDIO, D_CLIP = 768, 768, 512

# ---- utils ----
# Trailing modality/feature tag (matched case-insensitively at the end of a
# file stem) that is removed to recover the sample ID shared across modalities.
suffix_re = re.compile(
    r"_(?:h_text(?:_.*)?|h_audio(?:_.*)?|h_graph(?:_.*)?|h_clip(?:_.*)?|hubert_feats|audio_feats|tokens_for_selfattn)$",
    flags=re.IGNORECASE,
)


def base_id_from_name(name: str) -> str:
    """Return the sample ID encoded in a feature file name.

    The directory part and the last extension are dropped, then a trailing
    modality tag recognised by ``suffix_re`` is removed.

    Args:
        name: File name or path; converted with ``str()`` first.

    Returns:
        The bare sample ID.
    """
    filename = os.path.basename(str(name))
    stem, _ext = os.path.splitext(filename)
    return suffix_re.sub("", stem)

def map_ids(folder):
    """Index the feature files of one folder by sample ID.

    Args:
        folder: Directory to scan (not recursive) for ``*.npz`` and ``*.npy``.

    Returns:
        dict mapping ``base_id_from_name(path)`` to the file path. ``.npz``
        files are indexed first, so a ``.npy`` file with the same ID replaces
        the ``.npz`` entry.
    """
    id_to_path = {}
    for pattern in ("*.npz", "*.npy"):
        for found in glob.glob(os.path.join(folder, pattern)):
            id_to_path[base_id_from_name(found)] = found
    return id_to_path

def pick_key(keys, wanted):
    """Return the first entry of ``wanted`` that occurs in ``keys``.

    Args:
        keys: Container of available keys (anything supporting ``in``).
        wanted: Candidate keys in priority order.

    Returns:
        The highest-priority candidate present in ``keys``, or ``None`` when
        none of them is.
    """
    return next((candidate for candidate in wanted if candidate in keys), None)

def fix_dim(v, D):
    """Flatten ``v`` to a float32 vector of exactly ``D`` elements.

    Args:
        v: Array-like of any shape; it is flattened in C order.
        D: Required output length.

    Returns:
        1-D float32 array of length ``D``: the flattened input itself when it
        already has ``D`` elements, otherwise a new array holding the first
        ``D`` values, zero-padded at the end when the input is shorter.
    """
    flat = np.asarray(v, dtype=np.float32).reshape(-1)
    n = flat.size
    if n != D:
        resized = np.zeros(D, dtype=np.float32)
        keep = min(n, D)
        resized[:keep] = flat[:keep]
        return resized
    return flat

# Archive keys tried, in priority order, when an .npz has no "pooled" entry.
TEXT_KEYS  = ["h_text","token_embeddings","X","pooled"]
AUDIO_KEYS = ["h_audio","hubert_embeddings","audio_embeddings","features","hidden_states","X","pooled"]


def _mean_pool(x, path):
    """Average a 1-D, 2-D or 3-D feature array down to a single vector.

    1-D arrays are returned as they are, 2-D arrays are averaged over axis 0,
    and 3-D arrays over axis 1 and then axis 0. ``path`` is only used in the
    error message.

    Raises:
        ValueError: For any other number of dimensions.
    """
    if x.ndim == 1:
        return x
    if x.ndim == 2:
        return x.mean(0)
    if x.ndim == 3:
        return x.mean(1).mean(0)
    raise ValueError(f"Unexpected shape {x.shape} in {path}")


def load_pooled(path, wanted_keys, expected_dim):
    """Load one feature file and reduce it to a fixed-length float32 vector.

    ``.npy`` files hold a single array. ``.npz`` archives are searched first
    for any key containing "pooled" (case-insensitive), then for the first of
    ``wanted_keys`` that is present. The selected array is mean-pooled to one
    vector and passed through ``fix_dim``.

    Both file types go through the same rank handling. Previously a 3-D
    ``.npy`` array was flattened and truncated to its first ``expected_dim``
    values, and other ranks were silently padded.

    Args:
        path: Path to a ``.npy`` or ``.npz`` file.
        wanted_keys: Archive keys to try, in priority order.
        expected_dim: Length of the returned vector.

    Returns:
        1-D float32 array of length ``expected_dim``.

    Raises:
        KeyError: If an archive has neither a "pooled" key nor a wanted key.
        ValueError: If the selected array is not 1-D, 2-D or 3-D.
    """
    if path.endswith(".npy"):
        x = np.load(path, allow_pickle=False)
        v = _mean_pool(x, path)
        return fix_dim(v, expected_dim)

    # The context manager closes the archive even when an error is raised.
    with np.load(path, allow_pickle=False) as arr:
        pk = next((k for k in arr.files if "pooled" in k.lower()), None)
        if pk is not None:
            v = arr[pk]
            # An already-pooled entry may still carry singleton axes.
            v = v.mean(0) if v.ndim == 2 else np.squeeze(v)
            return fix_dim(v, expected_dim)

        fk = pick_key(arr.files, wanted_keys)
        if fk is None:
            raise KeyError(f"None of {wanted_keys} in {path}; keys={arr.files}")
        v = _mean_pool(arr[fk], path)
        return fix_dim(v, expected_dim)

# --- robust filename reader (handles UTF-16/UTF-8)
def read_filenames_any_encoding(txt_path):
    """Read a text file of filenames, one per line, whatever its encoding.

    The encoding is chosen from the leading byte-order mark (UTF-16 LE/BE or
    UTF-8). Without a BOM the bytes are tried as UTF-8 and fall back to
    Latin-1, which accepts any byte sequence.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        list[str]: The non-blank lines, stripped of surrounding whitespace.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If BOM-marked data is not valid in that encoding.
    """
    with open(txt_path, "rb") as f:
        data = f.read()

    if data.startswith((b"\xff\xfe", b"\xfe\xff")):
        # The generic "utf-16" codec reads the BOM to pick the byte order and
        # drops it. The explicit "-le"/"-be" codecs keep it as U+FEFF, which
        # str.strip() does not remove, so it would stay glued to the first name.
        text = data.decode("utf-16")
    elif data.startswith(b"\xef\xbb\xbf"):
        text = data.decode("utf-8-sig")
    else:
        try:
            text = data.decode("utf-8")
        except UnicodeDecodeError:
            text = data.decode("latin-1")

    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]

def load_clip_bundle_with_filenames(bundle_npz_path: str, filenames_txt_path: str, expected_dim=512):
    """Load a bundled feature matrix and key its rows by sample ID.

    The matrix is read from the archive key ``h_graph_nodes`` (shape [N, D],
    or [1, N, D] with the singleton axis dropped) or, failing that, from a
    2-D ``h_graph``. Row ``i`` is paired with line ``i`` of the filenames
    file, and each row is resized to ``expected_dim`` with ``fix_dim``.

    Args:
        bundle_npz_path: Path of the ``.npz`` bundle.
        filenames_txt_path: Text file listing one filename per matrix row.
        expected_dim: Length of every returned vector.

    Returns:
        dict mapping ``base_id_from_name(filename)`` to a float32 vector. If
        two filenames reduce to the same ID, the later row wins.

    Raises:
        KeyError: If the archive has neither ``h_graph_nodes`` nor a 2-D
            ``h_graph``.
        ValueError: If the matrix is not [N, D] / [1, N, D], or if its row
            count differs from the number of filenames.
    """
    # The context manager closes the archive even when an error is raised.
    with np.load(bundle_npz_path, allow_pickle=False) as arr:
        X = None
        if "h_graph_nodes" in arr.files:
            X = arr["h_graph_nodes"]          # [1,N,D] or [N,D]
            if X.ndim == 3:
                # Only a singleton leading axis can be dropped without
                # silently discarding the remaining slices.
                if X.shape[0] != 1:
                    raise ValueError(
                        f"{bundle_npz_path}: 'h_graph_nodes' has shape {X.shape}; expected [1,N,D] or [N,D]"
                    )
                X = X[0]
        elif "h_graph" in arr.files:
            candidate = arr["h_graph"]        # read once; each access re-reads the archive
            if candidate.ndim == 2:
                X = candidate                 # [N,D]
        if X is None:
            raise KeyError(f"{bundle_npz_path} needs 'h_graph_nodes' or 2D 'h_graph'; keys={arr.files}")

    # Rows must be vectors; a 1-D matrix would pair each filename with a scalar
    # that fix_dim would zero-pad into a meaningless vector.
    if X.ndim != 2:
        raise ValueError(f"{bundle_npz_path}: expected a 2-D [N,D] matrix, got shape {X.shape}")

    names = read_filenames_any_encoding(filenames_txt_path)
    if len(names) != X.shape[0]:
        raise ValueError(f"Row mismatch: {bundle_npz_path} rows={X.shape[0]} vs {filenames_txt_path} lines={len(names)}")

    return {base_id_from_name(nm): fix_dim(vec, expected_dim) for nm, vec in zip(names, X)}

# ---- build maps/lookups ----
# Sample ID -> CLIP/graph vector for both cohorts; PD entries are merged last,
# so they win if an ID appears in both bundles.
clip_vecs = {
    **load_clip_bundle_with_filenames(HC_READTEXT_NPZ, HC_READTEXT_TXT, D_CLIP),
    **load_clip_bundle_with_filenames(PD_READTEXT_NPZ, PD_READTEXT_TXT, D_CLIP),
}

# Sample ID -> feature file path, per modality (HC first, then PD).
tmap = dict(map_ids(HC_TEXT_DIR))
tmap.update(map_ids(PD_TEXT_DIR))
amap = dict(map_ids(HC_AUDIO_DIR))
amap.update(map_ids(PD_AUDIO_DIR))

print("counts — text:", len(tmap), "audio:", len(amap), "clip:", len(clip_vecs))

# Only samples present in all three modalities can be fused; sorting keeps the
# row order of the saved matrix reproducible.
common = sorted(tmap.keys() & amap.keys() & clip_vecs.keys())
print("common ids:", len(common))
if len(common) == 0:
    raise RuntimeError("No overlap across modalities.")

# ---- fuse & label ----
X_list, y_list, ids = [], [], []
for uid in common:
    t = load_pooled(tmap[uid], TEXT_KEYS, D_TEXT)
    a = load_pooled(amap[uid], AUDIO_KEYS, D_AUDIO)
    c = clip_vecs[uid]
    # Unit-normalise the CLIP vector; the epsilon guards against a zero vector.
    c = c / (np.linalg.norm(c) + 1e-9)

    # Fused feature layout: [text | audio | clip] -> 2048-d
    x = np.concatenate((t, a, c), axis=-1).astype(np.float32)
    X_list.append(x)
    # Label comes from the ID text: 0=HC when it contains "_hc_", otherwise 1=PD.
    y_list.append(int("_hc_" not in uid.lower()))
    ids.append(uid)

X = np.stack(X_list, axis=0)
y = np.asarray(y_list, dtype=np.int64)
np.savez_compressed(
    OUT_PATH,
    X=X,
    y=y,
    ids=np.array(ids),
    D_text=np.array(D_TEXT),
    D_audio=np.array(D_AUDIO),
    D_graph=np.array(D_CLIP),
    note="ReadText: concat [text|audio|h_graph(CLIP)] per ID",
)
print("Saved:", OUT_PATH, "| X:", X.shape, "| HC:", (y==0).sum(), "PD:", (y==1).sum())

counts — text: 37 audio: 37 clip: 37
common ids: 37
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/HC_h_fusion/hc_fused_readtext.npz | X: (37, 2048) | HC: 21 PD: 16


# Fusing all spontaneous embeddings together so that I can use them for training

In [13]:
import os, re, glob, numpy as np

# ---- paths (Spontaneous speech only) ----
# NOTE: the *_READTEXT_* variable names are carried over from the ReadText cell;
# in this cell they point at the Spontaneous files.
# Per-cohort (HC / PD) feature bundle (.npz) and the filenames list (.txt) whose
# lines are matched row-for-row against that bundle by
# load_clip_bundle_with_filenames().
HC_READTEXT_NPZ = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/h_graph_selfattn/HC_Spontaneous_Spectrogram_CLIP_features_h_graph_selfattn.npz"
HC_READTEXT_TXT = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/CLIP_Spectrogram_embeddings/HC_Spontaneous_Spectrogram_CLIP_filenames.txt"
PD_READTEXT_NPZ = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/h_graph_selfattn/PD_Spontaneous_Spectrogram_CLIP_features_h_graph_selfattn.npz"
PD_READTEXT_TXT = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/CLIP_Spectrogram_embeddings/PD_Spontaneous_Spectrogram_CLIP_filenames.txt"

# Folders scanned by map_ids() for one .npz/.npy feature file per sample
# (text and audio modalities, HC and PD cohorts).
HC_TEXT_DIR  = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Transcript_Self_Attention/HC_Spontaneous_selfattn"
PD_TEXT_DIR  = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Transcript_Self_Attention/PD_Spontaneous_selfattn"
HC_AUDIO_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/audio_self_attention/HC_Spontaneous_h_audio_selfattn"
PD_AUDIO_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/audio_self_attention/PD_Spontaneous_h_audio_selfattn"

# Destination of the fused dataset. NOTE: the file name still ends in
# "_readtext" although the data fused in this cell is Spontaneous.
OUT_PATH = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/spontaneous_h_fusion/spontaneous_fused_readtext.npz"
# Create the output folder up front so the final save cannot fail on a missing directory.
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

# Length each modality vector is padded/truncated to (via fix_dim) before
# concatenation; 768 + 768 + 512 = 2048 fused features.
D_TEXT, D_AUDIO, D_CLIP = 768, 768, 512

# ---- utils ----
# Trailing modality/feature tag (matched case-insensitively at the end of a
# file stem) that is removed to recover the sample ID shared across modalities.
suffix_re = re.compile(
    r"_(?:h_text(?:_.*)?|h_audio(?:_.*)?|h_graph(?:_.*)?|h_clip(?:_.*)?|hubert_feats|audio_feats|tokens_for_selfattn)$",
    flags=re.IGNORECASE,
)


def base_id_from_name(name: str) -> str:
    """Return the sample ID encoded in a feature file name.

    The directory part and the last extension are dropped, then a trailing
    modality tag recognised by ``suffix_re`` is removed.

    Args:
        name: File name or path; converted with ``str()`` first.

    Returns:
        The bare sample ID.
    """
    filename = os.path.basename(str(name))
    stem, _ext = os.path.splitext(filename)
    return suffix_re.sub("", stem)

def map_ids(folder):
    """Index the feature files of one folder by sample ID.

    Args:
        folder: Directory to scan (not recursive) for ``*.npz`` and ``*.npy``.

    Returns:
        dict mapping ``base_id_from_name(path)`` to the file path. ``.npz``
        files are indexed first, so a ``.npy`` file with the same ID replaces
        the ``.npz`` entry.
    """
    id_to_path = {}
    for pattern in ("*.npz", "*.npy"):
        for found in glob.glob(os.path.join(folder, pattern)):
            id_to_path[base_id_from_name(found)] = found
    return id_to_path

def pick_key(keys, wanted):
    """Return the first entry of ``wanted`` that occurs in ``keys``.

    Args:
        keys: Container of available keys (anything supporting ``in``).
        wanted: Candidate keys in priority order.

    Returns:
        The highest-priority candidate present in ``keys``, or ``None`` when
        none of them is.
    """
    return next((candidate for candidate in wanted if candidate in keys), None)

def fix_dim(v, D):
    """Flatten ``v`` to a float32 vector of exactly ``D`` elements.

    Args:
        v: Array-like of any shape; it is flattened in C order.
        D: Required output length.

    Returns:
        1-D float32 array of length ``D``: the flattened input itself when it
        already has ``D`` elements, otherwise a new array holding the first
        ``D`` values, zero-padded at the end when the input is shorter.
    """
    flat = np.asarray(v, dtype=np.float32).reshape(-1)
    n = flat.size
    if n != D:
        resized = np.zeros(D, dtype=np.float32)
        keep = min(n, D)
        resized[:keep] = flat[:keep]
        return resized
    return flat

# Archive keys tried, in priority order, when an .npz has no "pooled" entry.
TEXT_KEYS  = ["h_text","token_embeddings","X","pooled"]
AUDIO_KEYS = ["h_audio","hubert_embeddings","audio_embeddings","features","hidden_states","X","pooled"]


def _mean_pool(x, path):
    """Average a 1-D, 2-D or 3-D feature array down to a single vector.

    1-D arrays are returned as they are, 2-D arrays are averaged over axis 0,
    and 3-D arrays over axis 1 and then axis 0. ``path`` is only used in the
    error message.

    Raises:
        ValueError: For any other number of dimensions.
    """
    if x.ndim == 1:
        return x
    if x.ndim == 2:
        return x.mean(0)
    if x.ndim == 3:
        return x.mean(1).mean(0)
    raise ValueError(f"Unexpected shape {x.shape} in {path}")


def load_pooled(path, wanted_keys, expected_dim):
    """Load one feature file and reduce it to a fixed-length float32 vector.

    ``.npy`` files hold a single array. ``.npz`` archives are searched first
    for any key containing "pooled" (case-insensitive), then for the first of
    ``wanted_keys`` that is present. The selected array is mean-pooled to one
    vector and passed through ``fix_dim``.

    Both file types go through the same rank handling. Previously a 3-D
    ``.npy`` array was flattened and truncated to its first ``expected_dim``
    values, and other ranks were silently padded.

    Args:
        path: Path to a ``.npy`` or ``.npz`` file.
        wanted_keys: Archive keys to try, in priority order.
        expected_dim: Length of the returned vector.

    Returns:
        1-D float32 array of length ``expected_dim``.

    Raises:
        KeyError: If an archive has neither a "pooled" key nor a wanted key.
        ValueError: If the selected array is not 1-D, 2-D or 3-D.
    """
    if path.endswith(".npy"):
        x = np.load(path, allow_pickle=False)
        v = _mean_pool(x, path)
        return fix_dim(v, expected_dim)

    # The context manager closes the archive even when an error is raised.
    with np.load(path, allow_pickle=False) as arr:
        pk = next((k for k in arr.files if "pooled" in k.lower()), None)
        if pk is not None:
            v = arr[pk]
            # An already-pooled entry may still carry singleton axes.
            v = v.mean(0) if v.ndim == 2 else np.squeeze(v)
            return fix_dim(v, expected_dim)

        fk = pick_key(arr.files, wanted_keys)
        if fk is None:
            raise KeyError(f"None of {wanted_keys} in {path}; keys={arr.files}")
        v = _mean_pool(arr[fk], path)
        return fix_dim(v, expected_dim)

# --- robust filename reader (handles UTF-16/UTF-8)
def read_filenames_any_encoding(txt_path):
    """Read a text file of filenames, one per line, whatever its encoding.

    The encoding is chosen from the leading byte-order mark (UTF-16 LE/BE or
    UTF-8). Without a BOM the bytes are tried as UTF-8 and fall back to
    Latin-1, which accepts any byte sequence.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        list[str]: The non-blank lines, stripped of surrounding whitespace.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If BOM-marked data is not valid in that encoding.
    """
    with open(txt_path, "rb") as f:
        data = f.read()

    if data.startswith((b"\xff\xfe", b"\xfe\xff")):
        # The generic "utf-16" codec reads the BOM to pick the byte order and
        # drops it. The explicit "-le"/"-be" codecs keep it as U+FEFF, which
        # str.strip() does not remove, so it would stay glued to the first name.
        text = data.decode("utf-16")
    elif data.startswith(b"\xef\xbb\xbf"):
        text = data.decode("utf-8-sig")
    else:
        try:
            text = data.decode("utf-8")
        except UnicodeDecodeError:
            text = data.decode("latin-1")

    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]

def load_clip_bundle_with_filenames(bundle_npz_path: str, filenames_txt_path: str, expected_dim=512):
    """Load a bundled feature matrix and key its rows by sample ID.

    The matrix is read from the archive key ``h_graph_nodes`` (shape [N, D],
    or [1, N, D] with the singleton axis dropped) or, failing that, from a
    2-D ``h_graph``. Row ``i`` is paired with line ``i`` of the filenames
    file, and each row is resized to ``expected_dim`` with ``fix_dim``.

    Args:
        bundle_npz_path: Path of the ``.npz`` bundle.
        filenames_txt_path: Text file listing one filename per matrix row.
        expected_dim: Length of every returned vector.

    Returns:
        dict mapping ``base_id_from_name(filename)`` to a float32 vector. If
        two filenames reduce to the same ID, the later row wins.

    Raises:
        KeyError: If the archive has neither ``h_graph_nodes`` nor a 2-D
            ``h_graph``.
        ValueError: If the matrix is not [N, D] / [1, N, D], or if its row
            count differs from the number of filenames.
    """
    # The context manager closes the archive even when an error is raised.
    with np.load(bundle_npz_path, allow_pickle=False) as arr:
        X = None
        if "h_graph_nodes" in arr.files:
            X = arr["h_graph_nodes"]          # [1,N,D] or [N,D]
            if X.ndim == 3:
                # Only a singleton leading axis can be dropped without
                # silently discarding the remaining slices.
                if X.shape[0] != 1:
                    raise ValueError(
                        f"{bundle_npz_path}: 'h_graph_nodes' has shape {X.shape}; expected [1,N,D] or [N,D]"
                    )
                X = X[0]
        elif "h_graph" in arr.files:
            candidate = arr["h_graph"]        # read once; each access re-reads the archive
            if candidate.ndim == 2:
                X = candidate                 # [N,D]
        if X is None:
            raise KeyError(f"{bundle_npz_path} needs 'h_graph_nodes' or 2D 'h_graph'; keys={arr.files}")

    # Rows must be vectors; a 1-D matrix would pair each filename with a scalar
    # that fix_dim would zero-pad into a meaningless vector.
    if X.ndim != 2:
        raise ValueError(f"{bundle_npz_path}: expected a 2-D [N,D] matrix, got shape {X.shape}")

    names = read_filenames_any_encoding(filenames_txt_path)
    if len(names) != X.shape[0]:
        raise ValueError(f"Row mismatch: {bundle_npz_path} rows={X.shape[0]} vs {filenames_txt_path} lines={len(names)}")

    return {base_id_from_name(nm): fix_dim(vec, expected_dim) for nm, vec in zip(names, X)}

# ---- build maps/lookups ----
# Sample ID -> CLIP/graph vector for both cohorts; PD entries are merged last,
# so they win if an ID appears in both bundles.
clip_vecs = {
    **load_clip_bundle_with_filenames(HC_READTEXT_NPZ, HC_READTEXT_TXT, D_CLIP),
    **load_clip_bundle_with_filenames(PD_READTEXT_NPZ, PD_READTEXT_TXT, D_CLIP),
}

# Sample ID -> feature file path, per modality (HC first, then PD).
tmap = dict(map_ids(HC_TEXT_DIR))
tmap.update(map_ids(PD_TEXT_DIR))
amap = dict(map_ids(HC_AUDIO_DIR))
amap.update(map_ids(PD_AUDIO_DIR))

print("counts — text:", len(tmap), "audio:", len(amap), "clip:", len(clip_vecs))

# Only samples present in all three modalities can be fused; sorting keeps the
# row order of the saved matrix reproducible.
common = sorted(tmap.keys() & amap.keys() & clip_vecs.keys())
print("common ids:", len(common))
if not common:
    raise RuntimeError("No overlap across modalities.")

# ---- fuse & label ----
X_list, y_list, ids = [], [], []
for uid in common:
    t = load_pooled(tmap[uid], TEXT_KEYS, D_TEXT)
    a = load_pooled(amap[uid], AUDIO_KEYS, D_AUDIO)
    c = clip_vecs[uid]
    # Unit-normalise the CLIP vector; the epsilon guards against a zero vector.
    c = c / (np.linalg.norm(c) + 1e-9)

    # Fused feature layout: [text | audio | clip] -> 2048-d
    x = np.concatenate((t, a, c), axis=-1).astype(np.float32)
    X_list.append(x)
    # Label comes from the ID text: 0=HC when it contains "_hc_", otherwise 1=PD.
    y_list.append(0 if "_hc_" in uid.lower() else 1)
    ids.append(uid)

X = np.stack(X_list, axis=0)
y = np.array(y_list, dtype=np.int64)
# The note stored in the archive names the task whose embeddings were fused.
# It previously said "ReadText", copied over from the ReadText cell.
np.savez_compressed(
    OUT_PATH,
    X=X,
    y=y,
    ids=np.array(ids),
    D_text=np.array(D_TEXT),
    D_audio=np.array(D_AUDIO),
    D_graph=np.array(D_CLIP),
    note="Spontaneous: concat [text|audio|h_graph(CLIP)] per ID",
)
print("Saved:", OUT_PATH, "| X:", X.shape, "| HC:", (y==0).sum(), "PD:", (y==1).sum())

counts — text: 36 audio: 36 clip: 36
common ids: 36
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/spontaneous_h_fusion/spontaneous_fused_readtext.npz | X: (36, 2048) | HC: 20 PD: 16
