In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
import glob as glob

# Self-attention encoder: computing h_text (text representations)

In [2]:
# ---- Configuration for the HC read-text run (shared hyper-parameters are reused by later cells) ----
# Input directory that is globbed with PATTERN below.
HC_READTEXT_NPZ_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_ReadText_berts_feats_tokens_only"   # folder with *_tokens_for_selfattn.npz
# Output directory: a sibling of the input directory (the path keeps the literal ".." component).
HC_READTEXT_OUT_DIR      = os.path.join(HC_READTEXT_NPZ_DIR, "..", "hc_ReadText_selfattn")
# Glob pattern selecting the input files; the same suffix is stripped to build output names.
PATTERN      = "*_tokens_for_selfattn.npz"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_N    = 64             # sentences per forward pass; lower if OOM
# Encoder hyper-parameters, passed to SelfAttentionTextEncoder and stored next to the outputs.
D_MODEL    = 768
N_HEADS    = 8
N_LAYERS   = 2
D_FF       = 2048
DROPOUT    = 0.1
SAVE_DTYPE = np.float32     # set to np.float16 to save disk
SAVE_POOLED = True          # also save pooled [N, D_MODEL]

# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(HC_READTEXT_OUT_DIR, exist_ok=True)

In [3]:
# ================= Self-Attention Encoder =================
class SelfAttentionTextEncoder(nn.Module):
    """Transformer-encoder stack applied to pre-computed token embeddings.

    The input is optionally projected from ``d_in`` to ``d_model`` (the
    projection is an identity when both sizes match), passed through
    ``n_layers`` pre-norm encoder layers with GELU feed-forward blocks and
    finished with a LayerNorm.
    """

    def __init__(self, d_in=768, d_model=768, n_heads=8, n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        if d_in == d_model:
            self.in_proj = nn.Identity()
        else:
            self.in_proj = nn.Linear(d_in, d_model)
        block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, token_embeddings, attention_mask):
        """Encode a batch of token embeddings.

        Args:
            token_embeddings: tensor of shape [B, L, d_in].
            attention_mask: tensor of shape [B, L]; True/1 marks a valid token.

        Returns:
            Tensor of shape [B, L, d_model] with padded positions set to 0.
        """
        hidden = self.in_proj(token_embeddings)
        pad_positions = ~attention_mask.bool()  # True = position is ignored as a key
        hidden = self.encoder(hidden, src_key_padding_mask=pad_positions)
        hidden = self.out_norm(hidden)
        # Zero the padded rows so they carry no leftover values downstream.
        return hidden.masked_fill(pad_positions.unsqueeze(-1), 0.0)

def masked_mean(x, mask):
    """Average ``x`` over dimension 1, counting only positions where ``mask`` is set.

    The divisor is clamped to 1e-6, so a row whose mask is entirely empty
    yields zeros instead of a division by zero.
    """
    weights = mask.unsqueeze(-1).float()
    summed = (x * weights).sum(1)
    valid_count = weights.sum(1).clamp(min=1e-6)
    return summed / valid_count

# ===================== Main Loop =====================
def main(seed=0):
    """Encode every token NPZ found in HC_READTEXT_NPZ_DIR, one output file per input.

    Each input must contain ``token_embeddings`` (rank 3, unpacked below as
    N x L x d_in) and ``attention_mask`` (converted to bool; truthy marks a
    valid token). The encoder output, the unchanged mask, the optional
    masked-mean pooled vectors and the encoder hyper-parameters are written to
    ``<base>_h_text_selfattn.npz`` inside HC_READTEXT_OUT_DIR.

    Args:
        seed: Given to ``torch.manual_seed`` right before the encoder is
            constructed. No checkpoint is loaded in this function, so the
            encoder runs with its random initial weights; without a fixed
            seed every call would embed its data with a different random
            network and the outputs of separate calls would not be
            comparable. Note that this reseeds torch's global RNG.
    """
    paths = sorted(glob.glob(os.path.join(HC_READTEXT_NPZ_DIR, PATTERN)))
    print(f"Found {len(paths)} NPZ files.")
    if not paths:
        return

    encoder = None  # built lazily, once d_in of the first file is known

    for tpath in paths:
        base = os.path.basename(tpath).replace("_tokens_for_selfattn.npz", "")
        # NOTE(review): allow_pickle=True unpickles object arrays, which can run
        # arbitrary code; only point this at NPZ files you produced yourself.
        with np.load(tpath, allow_pickle=True) as arr:
            tokens = torch.from_numpy(arr["token_embeddings"]).float()  # [N, L, d_in]
            mask   = torch.from_numpy(arr["attention_mask"]).bool()     # [N, L]

        N, L, d_in = tokens.shape
        if N == 0:
            # torch.cat() further down cannot concatenate an empty chunk list.
            print(f"[skip] {os.path.basename(tpath)}: contains no sequences")
            continue

        # Build encoder once with correct input dim
        if encoder is None:
            torch.manual_seed(seed)  # fixed initial weights -> reproducible features
            encoder = SelfAttentionTextEncoder(
                d_in=d_in, d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, d_ff=D_FF, dropout=DROPOUT
            ).to(DEVICE).eval()

        # Compute h_text in chunks of BATCH_N sequences to bound device memory
        h_chunks, pool_chunks = [], []
        with torch.no_grad():
            for i in range(0, N, BATCH_N):
                x = tokens[i:i+BATCH_N].to(DEVICE)
                m = mask[i:i+BATCH_N].to(DEVICE)
                h = encoder(x, m)                    # [b, L, D_MODEL]
                h_chunks.append(h.cpu())
                if SAVE_POOLED:
                    pool_chunks.append(masked_mean(h, m).cpu())

        h_text = torch.cat(h_chunks, dim=0).numpy().astype(SAVE_DTYPE)       # [N, L, D_MODEL]
        payload = {
            "h_text": h_text,
            "attention_mask": mask.numpy(),          # keep the same mask, [N, L]
        }
        if SAVE_POOLED:
            h_text_pooled = torch.cat(pool_chunks, dim=0).numpy().astype(SAVE_DTYPE)  # [N, D_MODEL]
            # Stored only when requested: saving None would create a pickled
            # object array that np.load refuses to read without allow_pickle=True.
            payload["h_text_pooled"] = h_text_pooled
        payload.update(
            d_model=np.array(D_MODEL),
            n_heads=np.array(N_HEADS),
            n_layers=np.array(N_LAYERS),
            source_text_npz=os.path.basename(tpath),
        )

        out_path = os.path.join(HC_READTEXT_OUT_DIR, f"{base}_h_text_selfattn.npz")
        np.savez_compressed(out_path, **payload)
        print(f"Saved: {out_path} | h_text {h_text.shape}" + (f", pooled {h_text_pooled.shape}" if SAVE_POOLED else ""))

if __name__ == "__main__":
    main()


Found 21 NPZ files.




Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_ReadText_berts_feats_tokens_only/../hc_ReadText_selfattn/ID00_hc_0_0_0_h_text_selfattn.npz | h_text (15, 512, 768), pooled (15, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_ReadText_berts_feats_tokens_only/../hc_ReadText_selfattn/ID01_hc_0_0_0_h_text_selfattn.npz | h_text (24, 512, 768), pooled (24, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_ReadText_berts_feats_tokens_only/../hc_ReadText_selfattn/ID03_hc_0_0_0_h_text_selfattn.npz | h_text (30, 512, 768), pooled (30, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_ReadText_berts_feats_tokens_only/../hc_ReadText_selfattn/ID05_hc_0_0_0_h_text_selfattn.npz | h_text (20, 512, 768), pooled (20, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_ReadText_berts_feats_tokens_

In [4]:
# ---- Paths for the PD read-text run; hyper-parameters (DEVICE, D_MODEL, ...) come from the earlier config cell ----
# Input directory that is globbed with PATTERN below.
PD_TEXT_NPZ_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_ReadText_berts_feats_tokens_only"   # folder with *_tokens_for_selfattn.npz
# Output directory: a sibling of the input directory (the path keeps the literal ".." component).
PD_OUT_DIR      = os.path.join(PD_TEXT_NPZ_DIR, "..", "PD_ReadText_selfattn")
# Re-assigned with the same value as in the earlier config cell.
PATTERN      = "*_tokens_for_selfattn.npz"

# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(PD_OUT_DIR, exist_ok=True)

In [5]:
# ================= Self-Attention Encoder =================
class SelfAttentionTextEncoder(nn.Module):
    """Pre-norm Transformer encoder over pre-computed token embeddings.

    NOTE(review): this class is defined with the same name in several cells of
    this notebook; whichever cell ran last provides the active definition.
    """

    def __init__(self, d_in=768, d_model=768, n_heads=8, n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        # Project only when the incoming width differs from the model width.
        needs_projection = d_in != d_model
        self.in_proj = nn.Linear(d_in, d_model) if needs_projection else nn.Identity()
        layer_kwargs = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs), num_layers=n_layers
        )
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, token_embeddings, attention_mask):
        """Return encoded tokens [B, L, d_model]; padded positions are zeroed.

        ``token_embeddings`` is [B, L, d_in]; ``attention_mask`` is [B, L]
        with True/1 marking valid tokens.
        """
        projected = self.in_proj(token_embeddings)
        ignore = ~attention_mask.bool()  # True = ignore this position
        encoded = self.out_norm(self.encoder(projected, src_key_padding_mask=ignore))
        return encoded.masked_fill(ignore.unsqueeze(-1), 0.0)

def masked_mean(x, mask):
    """Mean of ``x`` along dimension 1 restricted to positions where ``mask`` is truthy.

    A fully masked row returns zeros because the denominator is clamped to 1e-6.
    """
    keep = mask.unsqueeze(-1).float()
    numerator = torch.sum(x * keep, 1)
    denominator = torch.clamp(torch.sum(keep, 1), min=1e-6)
    return numerator / denominator

# ===================== Main Loop =====================
def main(seed=0):
    """Encode every token NPZ found in PD_TEXT_NPZ_DIR, one output file per input.

    Each input must contain ``token_embeddings`` (rank 3, unpacked below as
    N x L x d_in) and ``attention_mask`` (converted to bool; truthy marks a
    valid token). The encoder output, the unchanged mask, the optional
    masked-mean pooled vectors and the encoder hyper-parameters are written to
    ``<base>_h_text_selfattn.npz`` inside PD_OUT_DIR.

    Args:
        seed: Given to ``torch.manual_seed`` right before the encoder is
            constructed. No checkpoint is loaded in this function, so the
            encoder runs with its random initial weights; without a fixed
            seed every call would embed its data with a different random
            network and the outputs of separate calls would not be
            comparable. Note that this reseeds torch's global RNG.
    """
    paths = sorted(glob.glob(os.path.join(PD_TEXT_NPZ_DIR, PATTERN)))
    print(f"Found {len(paths)} NPZ files.")
    if not paths:
        return

    encoder = None  # built lazily, once d_in of the first file is known

    for tpath in paths:
        base = os.path.basename(tpath).replace("_tokens_for_selfattn.npz", "")
        # NOTE(review): allow_pickle=True unpickles object arrays, which can run
        # arbitrary code; only point this at NPZ files you produced yourself.
        with np.load(tpath, allow_pickle=True) as arr:
            tokens = torch.from_numpy(arr["token_embeddings"]).float()  # [N, L, d_in]
            mask   = torch.from_numpy(arr["attention_mask"]).bool()     # [N, L]

        N, L, d_in = tokens.shape
        if N == 0:
            # torch.cat() further down cannot concatenate an empty chunk list.
            print(f"[skip] {os.path.basename(tpath)}: contains no sequences")
            continue

        # Build encoder once with correct input dim
        if encoder is None:
            torch.manual_seed(seed)  # fixed initial weights -> reproducible features
            encoder = SelfAttentionTextEncoder(
                d_in=d_in, d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, d_ff=D_FF, dropout=DROPOUT
            ).to(DEVICE).eval()

        # Compute h_text in chunks of BATCH_N sequences to bound device memory
        h_chunks, pool_chunks = [], []
        with torch.no_grad():
            for i in range(0, N, BATCH_N):
                x = tokens[i:i+BATCH_N].to(DEVICE)
                m = mask[i:i+BATCH_N].to(DEVICE)
                h = encoder(x, m)                    # [b, L, D_MODEL]
                h_chunks.append(h.cpu())
                if SAVE_POOLED:
                    pool_chunks.append(masked_mean(h, m).cpu())

        h_text = torch.cat(h_chunks, dim=0).numpy().astype(SAVE_DTYPE)       # [N, L, D_MODEL]
        payload = {
            "h_text": h_text,
            "attention_mask": mask.numpy(),          # keep the same mask, [N, L]
        }
        if SAVE_POOLED:
            h_text_pooled = torch.cat(pool_chunks, dim=0).numpy().astype(SAVE_DTYPE)  # [N, D_MODEL]
            # Stored only when requested: saving None would create a pickled
            # object array that np.load refuses to read without allow_pickle=True.
            payload["h_text_pooled"] = h_text_pooled
        payload.update(
            d_model=np.array(D_MODEL),
            n_heads=np.array(N_HEADS),
            n_layers=np.array(N_LAYERS),
            source_text_npz=os.path.basename(tpath),
        )

        out_path = os.path.join(PD_OUT_DIR, f"{base}_h_text_selfattn.npz")
        np.savez_compressed(out_path, **payload)
        print(f"Saved: {out_path} | h_text {h_text.shape}" + (f", pooled {h_text_pooled.shape}" if SAVE_POOLED else ""))

if __name__ == "__main__":
    main()


Found 16 NPZ files.
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_ReadText_berts_feats_tokens_only/../PD_ReadText_selfattn/ID02_pd_2_0_0_h_text_selfattn.npz | h_text (23, 512, 768), pooled (23, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_ReadText_berts_feats_tokens_only/../PD_ReadText_selfattn/ID04_pd_2_0_1_h_text_selfattn.npz | h_text (16, 512, 768), pooled (16, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_ReadText_berts_feats_tokens_only/../PD_ReadText_selfattn/ID06_pd_3_1_1_h_text_selfattn.npz | h_text (23, 512, 768), pooled (23, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_ReadText_berts_feats_tokens_only/../PD_ReadText_selfattn/ID07_pd_2_0_0_h_text_selfattn.npz | h_text (11, 512, 768), pooled (11, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_ReadText

In [6]:
# ---- Paths for the HC spontaneous-speech text run; hyper-parameters come from the earlier config cell ----
# Input directory that is globbed with PATTERN below.
HC_Spontaneous_NPZ_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_Spontaneous_berts_feats_tokens_only"   # folder with *_tokens_for_selfattn.npz
# Output directory: a sibling of the input directory (the path keeps the literal ".." component).
# NOTE(review): a later audio cell assigns HC_Spontaneous_OUT_DIR again with a different
# folder, so re-running the text main() after that cell would write to the audio folder.
HC_Spontaneous_OUT_DIR      = os.path.join(HC_Spontaneous_NPZ_DIR, "..", "HC_Spontaneous_selfattn")
# Re-assigned with the same value as in the earlier config cell.
PATTERN      = "*_tokens_for_selfattn.npz"

# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(HC_Spontaneous_OUT_DIR, exist_ok=True)

In [7]:
# ================= Self-Attention Encoder =================
class SelfAttentionTextEncoder(nn.Module):
    """Transformer-encoder stack applied to pre-computed token embeddings.

    Consists of an optional input projection (identity when ``d_in`` equals
    ``d_model``), ``n_layers`` pre-norm GELU encoder layers and a final
    LayerNorm.
    """

    def __init__(self, d_in=768, d_model=768, n_heads=8, n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        if d_in == d_model:
            self.in_proj = nn.Identity()
        else:
            self.in_proj = nn.Linear(d_in, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, token_embeddings, attention_mask):
        """Encode [B, L, d_in] embeddings given a [B, L] mask (True/1 = valid).

        Returns a [B, L, d_model] tensor whose padded positions are zero.
        """
        states = self.in_proj(token_embeddings)
        is_pad = ~attention_mask.bool()  # True = ignore
        states = self.encoder(states, src_key_padding_mask=is_pad)
        states = self.out_norm(states)
        states = states.masked_fill(is_pad.unsqueeze(-1), 0.0)  # tidy pads
        return states

def masked_mean(x, mask):
    """Masked average over dimension 1.

    ``mask`` gains a trailing axis so that it broadcasts against ``x``; the
    count of valid positions is clamped to 1e-6 to avoid dividing by zero.
    """
    mask_f = mask.unsqueeze(-1).float()
    masked_sum = (x * mask_f).sum(1)
    n_valid = mask_f.sum(1).clamp(min=1e-6)
    return masked_sum / n_valid

# ===================== Main Loop =====================
def main(seed=0):
    """Encode every token NPZ found in HC_Spontaneous_NPZ_DIR, one output file per input.

    Each input must contain ``token_embeddings`` (rank 3, unpacked below as
    N x L x d_in) and ``attention_mask`` (converted to bool; truthy marks a
    valid token). The encoder output, the unchanged mask, the optional
    masked-mean pooled vectors and the encoder hyper-parameters are written to
    ``<base>_h_text_selfattn.npz`` inside HC_Spontaneous_OUT_DIR.

    Args:
        seed: Given to ``torch.manual_seed`` right before the encoder is
            constructed. No checkpoint is loaded in this function, so the
            encoder runs with its random initial weights; without a fixed
            seed every call would embed its data with a different random
            network and the outputs of separate calls would not be
            comparable. Note that this reseeds torch's global RNG.
    """
    paths = sorted(glob.glob(os.path.join(HC_Spontaneous_NPZ_DIR, PATTERN)))
    print(f"Found {len(paths)} NPZ files.")
    if not paths:
        return

    encoder = None  # built lazily, once d_in of the first file is known

    for tpath in paths:
        base = os.path.basename(tpath).replace("_tokens_for_selfattn.npz", "")
        # NOTE(review): allow_pickle=True unpickles object arrays, which can run
        # arbitrary code; only point this at NPZ files you produced yourself.
        with np.load(tpath, allow_pickle=True) as arr:
            tokens = torch.from_numpy(arr["token_embeddings"]).float()  # [N, L, d_in]
            mask   = torch.from_numpy(arr["attention_mask"]).bool()     # [N, L]

        N, L, d_in = tokens.shape
        if N == 0:
            # torch.cat() further down cannot concatenate an empty chunk list.
            print(f"[skip] {os.path.basename(tpath)}: contains no sequences")
            continue

        # Build encoder once with correct input dim
        if encoder is None:
            torch.manual_seed(seed)  # fixed initial weights -> reproducible features
            encoder = SelfAttentionTextEncoder(
                d_in=d_in, d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, d_ff=D_FF, dropout=DROPOUT
            ).to(DEVICE).eval()

        # Compute h_text in chunks of BATCH_N sequences to bound device memory
        h_chunks, pool_chunks = [], []
        with torch.no_grad():
            for i in range(0, N, BATCH_N):
                x = tokens[i:i+BATCH_N].to(DEVICE)
                m = mask[i:i+BATCH_N].to(DEVICE)
                h = encoder(x, m)                    # [b, L, D_MODEL]
                h_chunks.append(h.cpu())
                if SAVE_POOLED:
                    pool_chunks.append(masked_mean(h, m).cpu())

        h_text = torch.cat(h_chunks, dim=0).numpy().astype(SAVE_DTYPE)       # [N, L, D_MODEL]
        payload = {
            "h_text": h_text,
            "attention_mask": mask.numpy(),          # keep the same mask, [N, L]
        }
        if SAVE_POOLED:
            h_text_pooled = torch.cat(pool_chunks, dim=0).numpy().astype(SAVE_DTYPE)  # [N, D_MODEL]
            # Stored only when requested: saving None would create a pickled
            # object array that np.load refuses to read without allow_pickle=True.
            payload["h_text_pooled"] = h_text_pooled
        payload.update(
            d_model=np.array(D_MODEL),
            n_heads=np.array(N_HEADS),
            n_layers=np.array(N_LAYERS),
            source_text_npz=os.path.basename(tpath),
        )

        out_path = os.path.join(HC_Spontaneous_OUT_DIR, f"{base}_h_text_selfattn.npz")
        np.savez_compressed(out_path, **payload)
        print(f"Saved: {out_path} | h_text {h_text.shape}" + (f", pooled {h_text_pooled.shape}" if SAVE_POOLED else ""))

if __name__ == "__main__":
    main()


Found 21 NPZ files.
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_Spontaneous_berts_feats_tokens_only/../HC_Spontaneous_selfattn/ID00_hc_0_0_0_h_text_selfattn.npz | h_text (58, 512, 768), pooled (58, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_Spontaneous_berts_feats_tokens_only/../HC_Spontaneous_selfattn/ID01_hc_0_0_0_h_text_selfattn.npz | h_text (27, 512, 768), pooled (27, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_Spontaneous_berts_feats_tokens_only/../HC_Spontaneous_selfattn/ID03_hc_0_0_0_h_text_selfattn.npz | h_text (33, 512, 768), pooled (33, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/HC_Spontaneous_berts_feats_tokens_only/../HC_Spontaneous_selfattn/ID05_hc_0_0_0_h_text_selfattn.npz | h_text (35, 512, 768), pooled (35, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_tex

In [9]:
# ---- Paths for the PD spontaneous-speech text run; hyper-parameters come from the earlier config cell ----
# Input directory that is globbed with PATTERN below.
PD_Spontaneous_TEXT_NPZ_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_Spontaneous_berts_feats_tokens_only"   # folder with *_tokens_for_selfattn.npz
# Output directory: a sibling of the input directory (the path keeps the literal ".." component).
PD_Spontaneous_OUT_DIR      = os.path.join(PD_Spontaneous_TEXT_NPZ_DIR, "..", "PD_Spontaneous_selfattn")
# Re-assigned with the same value as in the earlier config cell.
PATTERN      = "*_tokens_for_selfattn.npz"

# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(PD_Spontaneous_OUT_DIR, exist_ok=True)

In [10]:
# ================= Self-Attention Encoder =================
class SelfAttentionTextEncoder(nn.Module):
    """Pre-norm Transformer encoder over pre-computed token embeddings.

    NOTE(review): this class is defined with the same name in several cells of
    this notebook; whichever cell ran last provides the active definition.
    """

    def __init__(self, d_in=768, d_model=768, n_heads=8, n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        # A learned projection is only needed when the widths differ.
        self.in_proj = nn.Identity() if d_in == d_model else nn.Linear(d_in, d_model)
        base_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(base_layer, num_layers=n_layers)
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, token_embeddings, attention_mask):
        """Map [B, L, d_in] embeddings to [B, L, d_model] contextual states.

        ``attention_mask`` is [B, L] with True/1 for valid tokens; outputs at
        the remaining (padded) positions are overwritten with zeros.
        """
        out = self.in_proj(token_embeddings)
        padded = ~attention_mask.bool()  # True = ignore
        out = self.encoder(out, src_key_padding_mask=padded)
        out = self.out_norm(out)
        return out.masked_fill(padded.unsqueeze(-1), 0.0)

def masked_mean(x, mask):
    """Average ``x`` along dimension 1 over the positions selected by ``mask``.

    Rows with no selected position return zeros (denominator clamped to 1e-6).
    """
    selector = mask.unsqueeze(-1).float()
    total = torch.sum(x * selector, 1)
    count = torch.sum(selector, 1).clamp(min=1e-6)
    return total / count

# ===================== Main Loop =====================
def main(seed=0):
    """Encode every token NPZ found in PD_Spontaneous_TEXT_NPZ_DIR, one output file per input.

    Each input must contain ``token_embeddings`` (rank 3, unpacked below as
    N x L x d_in) and ``attention_mask`` (converted to bool; truthy marks a
    valid token). The encoder output, the unchanged mask, the optional
    masked-mean pooled vectors and the encoder hyper-parameters are written to
    ``<base>_h_text_selfattn.npz`` inside PD_Spontaneous_OUT_DIR.

    Args:
        seed: Given to ``torch.manual_seed`` right before the encoder is
            constructed. No checkpoint is loaded in this function, so the
            encoder runs with its random initial weights; without a fixed
            seed every call would embed its data with a different random
            network and the outputs of separate calls would not be
            comparable. Note that this reseeds torch's global RNG.
    """
    paths = sorted(glob.glob(os.path.join(PD_Spontaneous_TEXT_NPZ_DIR, PATTERN)))
    print(f"Found {len(paths)} NPZ files.")
    if not paths:
        return

    encoder = None  # built lazily, once d_in of the first file is known

    for tpath in paths:
        base = os.path.basename(tpath).replace("_tokens_for_selfattn.npz", "")
        # NOTE(review): allow_pickle=True unpickles object arrays, which can run
        # arbitrary code; only point this at NPZ files you produced yourself.
        with np.load(tpath, allow_pickle=True) as arr:
            tokens = torch.from_numpy(arr["token_embeddings"]).float()  # [N, L, d_in]
            mask   = torch.from_numpy(arr["attention_mask"]).bool()     # [N, L]

        N, L, d_in = tokens.shape
        if N == 0:
            # torch.cat() further down cannot concatenate an empty chunk list.
            print(f"[skip] {os.path.basename(tpath)}: contains no sequences")
            continue

        # Build encoder once with correct input dim
        if encoder is None:
            torch.manual_seed(seed)  # fixed initial weights -> reproducible features
            encoder = SelfAttentionTextEncoder(
                d_in=d_in, d_model=D_MODEL, n_heads=N_HEADS, n_layers=N_LAYERS, d_ff=D_FF, dropout=DROPOUT
            ).to(DEVICE).eval()

        # Compute h_text in chunks of BATCH_N sequences to bound device memory
        h_chunks, pool_chunks = [], []
        with torch.no_grad():
            for i in range(0, N, BATCH_N):
                x = tokens[i:i+BATCH_N].to(DEVICE)
                m = mask[i:i+BATCH_N].to(DEVICE)
                h = encoder(x, m)                    # [b, L, D_MODEL]
                h_chunks.append(h.cpu())
                if SAVE_POOLED:
                    pool_chunks.append(masked_mean(h, m).cpu())

        h_text = torch.cat(h_chunks, dim=0).numpy().astype(SAVE_DTYPE)       # [N, L, D_MODEL]
        payload = {
            "h_text": h_text,
            "attention_mask": mask.numpy(),          # keep the same mask, [N, L]
        }
        if SAVE_POOLED:
            h_text_pooled = torch.cat(pool_chunks, dim=0).numpy().astype(SAVE_DTYPE)  # [N, D_MODEL]
            # Stored only when requested: saving None would create a pickled
            # object array that np.load refuses to read without allow_pickle=True.
            payload["h_text_pooled"] = h_text_pooled
        payload.update(
            d_model=np.array(D_MODEL),
            n_heads=np.array(N_HEADS),
            n_layers=np.array(N_LAYERS),
            source_text_npz=os.path.basename(tpath),
        )

        out_path = os.path.join(PD_Spontaneous_OUT_DIR, f"{base}_h_text_selfattn.npz")
        np.savez_compressed(out_path, **payload)
        print(f"Saved: {out_path} | h_text {h_text.shape}" + (f", pooled {h_text_pooled.shape}" if SAVE_POOLED else ""))

if __name__ == "__main__":
    main()


Found 15 NPZ files.
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_Spontaneous_berts_feats_tokens_only/../PD_Spontaneous_selfattn/ID02_pd_2_0_0_h_text_selfattn.npz | h_text (32, 512, 768), pooled (32, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_Spontaneous_berts_feats_tokens_only/../PD_Spontaneous_selfattn/ID04_pd_2_0_1_h_text_selfattn.npz | h_text (35, 512, 768), pooled (35, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_Spontaneous_berts_feats_tokens_only/../PD_Spontaneous_selfattn/ID06_pd_3_1_1_h_text_selfattn.npz | h_text (23, 512, 768), pooled (23, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_text_embeddings/PD_Spontaneous_berts_feats_tokens_only/../PD_Spontaneous_selfattn/ID07_pd_2_0_0_h_text_selfattn.npz | h_text (40, 512, 768), pooled (40, 768)
Saved: /mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/berts_tex

# Self-attention encoder: computing h_audio (audio representations)

In [11]:
import os, glob, numpy as np, torch
import torch.nn as nn

# ===================== Config =====================
# NOTE(review): several names below (DEVICE, BATCH_N, D_MODEL, N_HEADS, ...) are
# also read as globals by the text-encoding main() functions earlier in this
# notebook. D_MODEL in particular changes from 768 to None here, so re-running
# a text cell after this one would pass d_model=None to the text encoder.
# Input directory that is globbed with each entry of PATTERNS.
AUDIO_NPZ_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/hubert_audio/HC_ReadText_hubert_features"   # <-- change to your folder with *_hubert_feats.npz
# Output directory: a sibling of the input directory (the path keeps the literal ".." component).
OUT_DIR       = os.path.join(AUDIO_NPZ_DIR, "..", "h_audio_selfattn")
# The last pattern matches every NPZ in the folder; main() de-duplicates the combined matches.
PATTERNS      = ["*_hubert_feats.npz", "*_audio_feats.npz", "*.npz"]  # will auto-pick the right keys

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_N     = 64             # sequences per forward pass when input is [N, L, D]
D_MODEL     = None           # None -> use d_in; or set e.g. 512/768/1024 to project
N_HEADS     = 8
N_LAYERS    = 2
D_FF        = 2048
DROPOUT     = 0.1
SAVE_DTYPE  = np.float32     # use np.float16 to halve disk space
SAVE_POOLED = True

# Create the output directory, then fail fast if either directory is missing.
os.makedirs(OUT_DIR, exist_ok=True)
assert os.path.isdir(AUDIO_NPZ_DIR), f"Missing input dir: {AUDIO_NPZ_DIR}"
assert os.path.isdir(OUT_DIR),       f"Missing output dir: {OUT_DIR}"

# ================= Model =================
class SelfAttentionAudioEncoder(nn.Module):
    """Transformer-encoder stack applied to pre-computed audio frame embeddings.

    The input is optionally projected from ``d_in`` to ``d_model`` (identity
    when both sizes match), passed through ``n_layers`` pre-norm GELU encoder
    layers and finished with a LayerNorm.
    """

    def __init__(self, d_in=768, d_model=768, n_heads=8, n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        if d_in == d_model:
            self.in_proj = nn.Identity()
        else:
            self.in_proj = nn.Linear(d_in, d_model)
        block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, audio_embeddings, attention_mask):
        """Encode audio embeddings, accepting batched or single-sequence input.

        Args:
            audio_embeddings: [B, L, d_in] or [L, d_in].
            attention_mask: [B, L] or [L]; 1/True = valid, 0/False = pad.

        Returns:
            [B, L, d_model] tensor (B is 1 for single-sequence input) with
            padded positions set to 0.
        """
        # Promote un-batched inputs to a batch of one.
        if audio_embeddings.dim() == 2:
            audio_embeddings = audio_embeddings.unsqueeze(0)
        if attention_mask.dim() == 1:
            attention_mask = attention_mask.unsqueeze(0)

        hidden = self.in_proj(audio_embeddings)      # [B, L, d_model]
        pad_positions = ~attention_mask.bool()       # True=ignore
        hidden = self.encoder(hidden, src_key_padding_mask=pad_positions)
        hidden = self.out_norm(hidden)
        return hidden.masked_fill(pad_positions.unsqueeze(-1), 0.0)

def masked_mean(x, mask):
    """Mean over the sequence axis using only valid positions.

    x is [B, L, D] and mask is [B, L] (True=valid); the result is [B, D].
    The valid-position count is clamped to 1e-6 so empty masks give zeros.
    """
    weights = mask.unsqueeze(-1).float()
    summed = (x * weights).sum(1)
    valid_count = weights.sum(1).clamp(min=1e-6)
    return summed / valid_count

# ================= I/O helpers =================
def pick_feat_key(keys):
    """Return the first recognised feature-array name that occurs in ``keys``.

    Candidates are tried in a fixed priority order.

    Raises:
        KeyError: if none of the candidate names is present.
    """
    candidates = ("hubert_embeddings", "audio_embeddings", "features", "hidden_states")
    found = next((name for name in candidates if name in keys), None)
    if found is None:
        raise KeyError(f"No audio feature key found in {sorted(keys)}")
    return found

def load_audio_npz(path):
    """Load the feature array and its mask from one audio NPZ file.

    The feature array is looked up under the name chosen by ``pick_feat_key``.
    If the archive has no ``attention_mask`` entry, an all-True mask is
    created: two-dimensional for rank-3 features, one-dimensional otherwise.

    Returns:
        (feats, mask) as numpy arrays.
    """
    # NOTE(review): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code; only use on trusted files. mmap_mode is presumably a
    # no-op for .npz archives -- confirm against the numpy.load docs.
    with np.load(path, allow_pickle=True, mmap_mode="r") as npz:
        names = npz.files
        feats = npz[pick_feat_key(names)]  # np.ndarray: [L, D] or [N, L, D]
        if "attention_mask" in names:
            return feats, npz["attention_mask"]
        # No stored mask: treat every position as valid.
        mask_shape = (feats.shape[0], feats.shape[1]) if feats.ndim == 3 else (feats.shape[0],)
        return feats, np.ones(mask_shape, dtype=bool)

# ================= Main =================
def main(seed=0):
    """Encode every audio-feature NPZ in AUDIO_NPZ_DIR and save one output per input.

    Inputs are loaded with ``load_audio_npz`` and may be a single sequence
    ([L, D]) or a stack of sequences ([N, L, D]). Outputs already present in
    OUT_DIR are skipped. Each result file stores ``h_audio`` ([N, L, d_model],
    with N == 1 for single-sequence input), the mask exactly as loaded,
    optionally ``h_audio_pooled`` and the encoder hyper-parameters.

    Args:
        seed: Given to ``torch.manual_seed`` right before each encoder build.
            No checkpoint is loaded here, so the encoder keeps its random
            initial weights. Because existing outputs are skipped, an
            unseeded re-run would mix files produced by different random
            networks; a fixed seed keeps all outputs comparable. Note that
            this reseeds torch's global RNG.
    """
    # collect files; the patterns overlap, so de-dup before sorting
    paths = []
    for pat in PATTERNS:
        paths.extend(glob.glob(os.path.join(AUDIO_NPZ_DIR, pat)))
    paths = sorted(set(paths))
    print(f"Found {len(paths)} NPZ files to process.")

    encoder = None
    d_in_cached = None
    d_model = None

    for p in paths:
        src_name = os.path.basename(p)
        base = os.path.splitext(src_name)[0]
        stem = base.replace('_hubert_feats', '').replace('_audio_feats', '')
        out_path = os.path.join(OUT_DIR, f"{stem}_h_audio_selfattn.npz")
        if os.path.exists(out_path):
            print(f"[skip] {os.path.basename(out_path)} (exists)")
            continue

        # Load and convert inside one guard so a single bad file (missing key,
        # unsupported dtype, ...) is reported and skipped instead of aborting.
        try:
            feats_np, mask_np = load_audio_npz(p)  # feats: [L,D] or [N,L,D]; mask: [L] or [N,L]
            feats = torch.from_numpy(feats_np).float()
            mask  = torch.from_numpy(mask_np).bool()
        except Exception as e:
            print(f"[err]  {src_name}: {e}")
            continue

        # Bring both layouts to a batched [N, L, D] / [N, L] view.
        if feats.dim() == 2:
            feats_b = feats.unsqueeze(0)
            mask_b = mask.unsqueeze(0) if mask.dim() == 1 else mask
        elif feats.dim() == 3:
            feats_b, mask_b = feats, mask
        else:
            print(f"[err]  {src_name}: unexpected feature shape {feats.shape}")
            continue

        N, L, d_in = feats_b.shape
        if N == 0:
            # torch.cat() below cannot concatenate an empty chunk list.
            print(f"[err]  {src_name}: file contains no sequences")
            continue
        if tuple(mask_b.shape) != (N, L):
            # A mismatched mask would either crash inside the encoder or
            # broadcast silently into a wrongly shaped pooled output.
            print(f"[err]  {src_name}: mask shape {tuple(mask.shape)} does not match features {tuple(feats.shape)}")
            continue

        # build encoder once (or rebuild if d_in changes across files)
        if d_in_cached != d_in:
            d_in_cached = d_in
            d_model = d_in if D_MODEL is None else int(D_MODEL)
            torch.manual_seed(seed)  # fixed initial weights -> reproducible features
            encoder = SelfAttentionAudioEncoder(
                d_in=d_in, d_model=d_model, n_heads=N_HEADS, n_layers=N_LAYERS, d_ff=D_FF, dropout=DROPOUT
            ).to(DEVICE).eval()
            print(f"[info] initialized encoder: d_in={d_in}, d_model={d_model}, heads={N_HEADS}, layers={N_LAYERS}")

        # run encoder in chunks of BATCH_N sequences
        h_chunks, pool_chunks = [], []
        with torch.no_grad():
            for i in range(0, N, BATCH_N):
                xb = feats_b[i:i+BATCH_N].to(DEVICE)                          # [b, L, d_in]
                mb = mask_b[i:i+BATCH_N].to(DEVICE)                           # [b, L]
                hb = encoder(xb, mb)                                          # [b, L, d_model]
                h_chunks.append(hb.cpu())
                if SAVE_POOLED:
                    pool_chunks.append(masked_mean(hb, mb).cpu())             # [b, d_model]

        h_np = torch.cat(h_chunks, 0).numpy().astype(SAVE_DTYPE)              # [N, L, d_model]
        m_np = mask.numpy()                                                   # mask as loaded: [L] or [N, L]
        poolnp = torch.cat(pool_chunks, 0).numpy().astype(SAVE_DTYPE) if SAVE_POOLED else None

        payload = {"h_audio": h_np, "attention_mask": m_np}
        if poolnp is not None:
            # Stored only when requested: saving None would create a pickled
            # object array that np.load refuses to read without allow_pickle=True.
            payload["h_audio_pooled"] = poolnp
        payload.update(
            d_model=np.array(d_model),
            n_heads=np.array(N_HEADS),
            n_layers=np.array(N_LAYERS),
            source_audio_npz=src_name,
        )

        # save: write to a temporary name and rename, so an interrupted run
        # never leaves a truncated file that the exists-check would then skip
        tmp_path = out_path[:-len(".npz")] + ".partial.npz"
        np.savez_compressed(tmp_path, **payload)
        os.replace(tmp_path, out_path)
        print(f"[ok]   {os.path.basename(p)} -> h_audio {h_np.shape}" + (f", pooled {poolnp.shape}" if SAVE_POOLED else ""))

if __name__ == "__main__":
    main()


Found 21 NPZ files to process.
[info] initialized encoder: d_in=768, d_model=768, heads=8, layers=2
[ok]   ID00_hc_0_0_0_hubert_feats.npz -> h_audio (1, 7550, 768), pooled (1, 768)
[ok]   ID01_hc_0_0_0_hubert_feats.npz -> h_audio (1, 8200, 768), pooled (1, 768)
[ok]   ID03_hc_0_0_0_hubert_feats.npz -> h_audio (1, 6973, 768), pooled (1, 768)
[ok]   ID05_hc_0_0_0_hubert_feats.npz -> h_audio (1, 5543, 768), pooled (1, 768)
[ok]   ID08_hc_0_0_0_hubert_feats.npz -> h_audio (1, 7300, 768), pooled (1, 768)
[ok]   ID09_hc_0_0_0_hubert_feats.npz -> h_audio (1, 6269, 768), pooled (1, 768)
[ok]   ID10_hc_0_0_0_hubert_feats.npz -> h_audio (1, 6834, 768), pooled (1, 768)
[ok]   ID11_hc_0_0_0_hubert_feats.npz -> h_audio (1, 7415, 768), pooled (1, 768)
[ok]   ID12_hc_0_0_0_hubert_feats.npz -> h_audio (1, 6522, 768), pooled (1, 768)
[ok]   ID14_hc_0_0_0_hubert_feats.npz -> h_audio (1, 8415, 768), pooled (1, 768)
[ok]   ID15_hc_0_0_0_hubert_feats.npz -> h_audio (1, 8526, 768), pooled (1, 768)
[ok]   ID

In [12]:
import os, glob, numpy as np, torch
import torch.nn as nn

# ===================== Config =====================
# Input directory that is globbed with each entry of PATTERNS.
HC_Spontaneous_AUDIO_NPZ_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/hubert_audio/HC_Spontaneous_hubert_features"   # <-- change to your folder with *_hubert_feats.npz
# Output directory: a sibling of the input directory (the path keeps the literal ".." component).
# NOTE(review): HC_Spontaneous_OUT_DIR is also the name assigned by the earlier
# text cell (there it points at ".../HC_Spontaneous_selfattn"); this assignment
# rebinds it, so re-running that text cell's main() afterwards writes here.
HC_Spontaneous_OUT_DIR       = os.path.join(HC_Spontaneous_AUDIO_NPZ_DIR, "..", "HC_Spontaneous_h_audio_selfattn")
# The last pattern matches every NPZ in the folder; main() de-duplicates the combined matches.
PATTERNS      = ["*_hubert_feats.npz", "*_audio_feats.npz", "*.npz"]  # will auto-pick the right keys

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_N     = 64             # sequences per forward pass when input is [N, L, D]
D_MODEL     = None           # None -> use d_in; or set e.g. 512/768/1024 to project
N_HEADS     = 8
N_LAYERS    = 2
D_FF        = 2048
DROPOUT     = 0.1
SAVE_DTYPE  = np.float32     # use np.float16 to halve disk space
SAVE_POOLED = True

# Create the output directory, then fail fast if either directory is missing.
os.makedirs(HC_Spontaneous_OUT_DIR, exist_ok=True)
assert os.path.isdir(HC_Spontaneous_AUDIO_NPZ_DIR), f"Missing input dir: {HC_Spontaneous_AUDIO_NPZ_DIR}"
assert os.path.isdir(HC_Spontaneous_OUT_DIR),       f"Missing output dir: {HC_Spontaneous_OUT_DIR}"

# ================= Model =================
class SelfAttentionAudioEncoder(nn.Module):
    """Pre-norm Transformer encoder over pre-computed audio frame embeddings.

    NOTE(review): a class with this name is also defined in an earlier cell of
    this notebook; whichever cell ran last provides the active definition.
    """

    def __init__(self, d_in=768, d_model=768, n_heads=8, n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        # A learned projection is only needed when the widths differ.
        self.in_proj = nn.Identity() if d_in == d_model else nn.Linear(d_in, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, audio_embeddings, attention_mask):
        """
        audio_embeddings: [B, L, d_in]  or [L, d_in]
        attention_mask:   [B, L]        or [L]  (1/True=valid, 0/False=pad)
        returns:          [B, L, d_model], padded positions zeroed
        """
        # Single-sequence inputs are treated as a batch of one.
        frames = audio_embeddings.unsqueeze(0) if audio_embeddings.dim() == 2 else audio_embeddings
        valid = attention_mask.unsqueeze(0) if attention_mask.dim() == 1 else attention_mask

        states = self.in_proj(frames)                # [B, L, d_model]
        is_pad = ~valid.bool()                       # True=ignore
        states = self.out_norm(self.encoder(states, src_key_padding_mask=is_pad))
        return states.masked_fill(is_pad.unsqueeze(-1), 0.0)

def masked_mean(x, mask):
    """Masked average over the sequence axis.

    x: [B, L, D], mask: [B, L] (True=valid) -> [B, D]. The divisor is clamped
    to 1e-6, so rows without any valid position come out as zeros.
    """
    keep = mask.unsqueeze(-1).float()
    numerator = torch.sum(x * keep, 1)
    denominator = torch.clamp(torch.sum(keep, 1), min=1e-6)
    return numerator / denominator

# ================= I/O helpers =================
def pick_feat_key(keys):
    """Choose which entry of ``keys`` holds the audio features.

    The known names are checked in priority order and the first one found in
    ``keys`` is returned; a KeyError listing the sorted keys is raised when
    none of them is present.
    """
    for candidate in ("hubert_embeddings", "audio_embeddings", "features", "hidden_states"):
        if candidate in keys:
            break
    else:
        raise KeyError(f"No audio feature key found in {sorted(keys)}")
    return candidate

def load_audio_npz(path):
    """Read one audio NPZ and return ``(feats, mask)`` as numpy arrays.

    ``feats`` is the array stored under the name selected by
    ``pick_feat_key``. ``mask`` is the stored ``attention_mask`` when the
    archive has one; otherwise an all-True mask is built with shape
    (N, L) for rank-3 features or (L,) for anything else.
    """
    # NOTE(review): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code; only use on trusted files. mmap_mode is presumably a
    # no-op for .npz archives -- confirm against the numpy.load docs.
    with np.load(path, allow_pickle=True, mmap_mode="r") as archive:
        stored = archive.files
        feats = archive[pick_feat_key(stored)]  # np.ndarray: [L, D] or [N, L, D]
        if "attention_mask" in stored:
            mask = archive["attention_mask"]
        else:
            # if no mask, assume all valid
            if feats.ndim == 3:
                default_shape = (feats.shape[0], feats.shape[1])
            else:
                default_shape = (feats.shape[0],)
            mask = np.ones(default_shape, dtype=bool)
    return feats, mask  # numpy arrays

# ================= Main =================
def main():
    """Encode every feature archive under ``HC_Spontaneous_AUDIO_NPZ_DIR``.

    For each ``.npz`` matched by ``PATTERNS`` the function skips inputs whose output
    already exists, loads features and mask with ``load_audio_npz``, runs the
    self-attention encoder in chunks of ``BATCH_N`` sequences under
    ``torch.no_grad()``, and writes ``<stem>_h_audio_selfattn.npz`` into
    ``HC_Spontaneous_OUT_DIR``.

    Saved keys: ``h_audio`` ([N, L, d_model]; N=1 for a single sequence),
    ``attention_mask`` (as loaded), ``h_audio_pooled`` ([N, d_model], only when
    ``SAVE_POOLED`` is true), ``d_model``, ``n_heads``, ``n_layers`` and
    ``source_audio_npz``.
    """
    # No checkpoint is loaded below, so the encoder weights are whatever the random
    # initialiser produces. Seeding right before construction makes every run -- and
    # every cohort cell that uses the same hyper-parameters -- encode with identical
    # weights. NOTE: torch.manual_seed resets the global torch RNG.
    init_seed = 0

    # collect files ("*.npz" overlaps the specific patterns, hence the set)
    paths = sorted({
        hit
        for pat in PATTERNS
        for hit in glob.glob(os.path.join(HC_Spontaneous_AUDIO_NPZ_DIR, pat))
    })
    print(f"Found {len(paths)} NPZ files to process.")

    encoder = None
    d_in_cached = None
    d_model = None

    for p in paths:
        src_name = os.path.basename(p)
        base = os.path.splitext(src_name)[0]
        out_path = os.path.join(HC_Spontaneous_OUT_DIR, f"{base.replace('_hubert_feats','').replace('_audio_feats','')}_h_audio_selfattn.npz")
        if os.path.exists(out_path):
            print(f"[skip] {os.path.basename(out_path)} (exists)")
            continue

        try:
            feats_np, mask_np = load_audio_npz(p)  # feats: [L,D] or [N,L,D]; mask: [L] or [N,L]
        except Exception as e:
            print(f"[err]  {src_name}: {e}")
            continue

        # standardize to torch
        feats = torch.from_numpy(feats_np).float()
        mask = torch.from_numpy(mask_np).bool()

        if feats.dim() not in (2, 3):
            print(f"[err]  {src_name}: unexpected feature shape {feats.shape}")
            continue

        # Give a single sequence an explicit batch axis so one code path serves both layouts.
        single = feats.dim() == 2
        feats_b = feats.unsqueeze(0) if single else feats       # [N, L, d_in]
        mask_b = mask.unsqueeze(0) if single else mask           # [N, L]
        n_seq, d_in = feats_b.shape[0], feats_b.shape[-1]
        if n_seq == 0:
            print(f"[err]  {src_name}: no sequences in feature array {tuple(feats.shape)}")
            continue

        # build encoder once (or rebuild if d_in changes across files)
        if d_in_cached != d_in:
            d_in_cached = d_in
            d_model = d_in if D_MODEL is None else int(D_MODEL)
            torch.manual_seed(init_seed)
            encoder = SelfAttentionAudioEncoder(
                d_in=d_in, d_model=d_model, n_heads=N_HEADS, n_layers=N_LAYERS, d_ff=D_FF, dropout=DROPOUT
            ).to(DEVICE).eval()
            print(f"[info] initialized encoder: d_in={d_in}, d_model={d_model}, heads={N_HEADS}, layers={N_LAYERS}")

        # run encoder in chunks along N
        h_chunks, pool_chunks = [], []
        with torch.no_grad():
            for i in range(0, n_seq, BATCH_N):
                xb = feats_b[i:i+BATCH_N].to(DEVICE)                         # [b, L, d_in]
                mb = mask_b[i:i+BATCH_N].to(DEVICE)                          # [b, L]
                hb = encoder(xb, mb)                                         # [b, L, d_model]
                h_chunks.append(hb.cpu())
                if SAVE_POOLED:
                    pool_chunks.append(masked_mean(hb, mb).cpu())            # [b, d_model]

        h_np = torch.cat(h_chunks, 0).numpy().astype(SAVE_DTYPE)            # [N, L, d_model]
        m_np = mask.cpu().numpy()                                            # original layout: [L] or [N, L]

        payload = dict(
            h_audio=h_np,
            attention_mask=m_np,
            d_model=np.array(d_model),
            n_heads=np.array(N_HEADS),
            n_layers=np.array(N_LAYERS),
            source_audio_npz=src_name,
        )
        poolnp = None
        if SAVE_POOLED:
            poolnp = torch.cat(pool_chunks, 0).numpy().astype(SAVE_DTYPE)   # [N, d_model]
            # Only stored when it exists: saving None would create a pickled object array.
            payload["h_audio_pooled"] = poolnp

        # save -- write to a temporary name and rename, so an interrupted run cannot
        # leave a truncated file that the "exists -> skip" check would accept.
        tmp_path = out_path + ".tmp.npz"
        np.savez_compressed(tmp_path, **payload)
        os.replace(tmp_path, out_path)
        print(f"[ok]   {src_name} -> h_audio {h_np.shape}" + (f", pooled {poolnp.shape}" if SAVE_POOLED else ""))

# Execute the pipeline when this cell/script is run directly (skipped if imported as a module).
if __name__ == "__main__":
    main()


Found 21 NPZ files to process.
[info] initialized encoder: d_in=768, d_model=768, heads=8, layers=2
[ok]   ID00_hc_0_0_0_hubert_feats.npz -> h_audio (1, 5988, 768), pooled (1, 768)
[ok]   ID01_hc_0_0_0_hubert_feats.npz -> h_audio (1, 5850, 768), pooled (1, 768)
[ok]   ID03_hc_0_0_0_hubert_feats.npz -> h_audio (1, 7700, 768), pooled (1, 768)
[ok]   ID05_hc_0_0_0_hubert_feats.npz -> h_audio (1, 9218, 768), pooled (1, 768)
[ok]   ID08_hc_0_0_0_hubert_feats.npz -> h_audio (1, 6414, 768), pooled (1, 768)
[ok]   ID09_hc_0_0_0_hubert_feats.npz -> h_audio (1, 7060, 768), pooled (1, 768)
[ok]   ID10_hc_0_0_0_hubert_feats.npz -> h_audio (1, 8467, 768), pooled (1, 768)
[ok]   ID11_hc_0_0_0_hubert_feats.npz -> h_audio (1, 8305, 768), pooled (1, 768)
[ok]   ID12_hc_0_0_0_hubert_feats.npz -> h_audio (1, 8255, 768), pooled (1, 768)
[ok]   ID14_hc_0_0_0_hubert_feats.npz -> h_audio (1, 6938, 768), pooled (1, 768)
[ok]   ID15_hc_0_0_0_hubert_feats.npz -> h_audio (1, 11024, 768), pooled (1, 768)
[ok]   I

In [13]:
import os, glob, numpy as np, torch
import torch.nn as nn

# ===================== Config =====================
# Input folder with the per-recording feature archives, and the sibling folder that
# receives the encoder outputs.
PD_ReadText_AUDIO_NPZ_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/hubert_audio/PD_ReadText_hubert_features"   # <-- change to your folder with *_hubert_feats.npz
PD_ReadText_OUT_DIR       = os.path.join(PD_ReadText_AUDIO_NPZ_DIR, "..", "PD_ReadText_h_audio_selfattn")
PATTERNS      = ["*_hubert_feats.npz", "*_audio_feats.npz", "*.npz"]  # will auto-pick the right keys

DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_N     = 64             # sequences per forward pass when input is [N, L, D]
D_MODEL     = None           # None -> use d_in; or set e.g. 512/768/1024 to project
N_HEADS     = 8
N_LAYERS    = 2
D_FF        = 2048
DROPOUT     = 0.1
SAVE_DTYPE  = np.float32     # use np.float16 to halve disk space
SAVE_POOLED = True

# Check the input folder BEFORE creating the output folder. OUT_DIR is spelled
# "<input>/../<name>", so os.makedirs() would otherwise create a missing (e.g.
# mistyped) input folder as an intermediate directory and this check could never fail.
assert os.path.isdir(PD_ReadText_AUDIO_NPZ_DIR), f"Missing input dir: {PD_ReadText_AUDIO_NPZ_DIR}"
os.makedirs(PD_ReadText_OUT_DIR, exist_ok=True)
assert os.path.isdir(PD_ReadText_OUT_DIR),       f"Missing output dir: {PD_ReadText_OUT_DIR}"

# ================= Model =================
class SelfAttentionAudioEncoder(nn.Module):
    """Pre-norm Transformer encoder over a sequence of audio frame embeddings.

    Pipeline: optional linear projection (``d_in`` -> ``d_model``, skipped when the two
    are equal), ``n_layers`` GELU ``TransformerEncoderLayer`` blocks (batch-first,
    norm-first), a final ``LayerNorm``, then padded positions are reset to 0.
    """

    def __init__(self, d_in=768, d_model=768, n_heads=8, n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        # A projection is only needed when input and model widths differ.
        if d_in == d_model:
            self.in_proj = nn.Identity()
        else:
            self.in_proj = nn.Linear(d_in, d_model)

        block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, audio_embeddings, attention_mask):
        """Encode the frames and zero the padded positions.

        Args:
            audio_embeddings: [B, L, d_in], or [L, d_in] for a single sequence.
            attention_mask: [B, L] or [L]; 1/True = valid frame, 0/False = padding.

        Returns:
            Tensor shaped [B, L, d_model]; padded positions are exactly 0.
        """
        # Promote unbatched inputs to a batch of one.
        feats = audio_embeddings[None] if audio_embeddings.dim() == 2 else audio_embeddings
        valid = attention_mask[None] if attention_mask.dim() == 1 else attention_mask

        pad = ~valid.bool()                                     # True = ignore
        hidden = self.in_proj(feats)                            # [B, L, d_model]
        hidden = self.encoder(hidden, src_key_padding_mask=pad)
        hidden = self.out_norm(hidden)
        return hidden.masked_fill(pad.unsqueeze(-1), 0.0)       # tidy pads

def masked_mean(x, mask):
    """Average ``x`` over dim 1, counting only positions where ``mask`` is set.

    Args:
        x: tensor shaped [B, L, D].
        mask: tensor shaped [B, L]; True/non-zero marks a valid position.

    Returns:
        Tensor shaped [B, D]. A row with no valid position yields zeros, because the
        divisor is clamped to 1e-6 instead of being allowed to reach 0.
    """
    weights = mask.unsqueeze(-1).float()              # [B, L, 1]
    total = (x * weights).sum(1)                      # [B, D]
    count = weights.sum(1).clamp(min=1e-6)            # [B, 1]
    return total / count

# ================= I/O helpers =================
def pick_feat_key(keys):
    """Return the first recognised audio-feature key contained in ``keys``.

    Candidates are tried in this priority order: "hubert_embeddings",
    "audio_embeddings", "features", "hidden_states".

    Raises:
        KeyError: none of the candidates is present in ``keys``.
    """
    candidates = ("hubert_embeddings", "audio_embeddings", "features", "hidden_states")
    chosen = next((name for name in candidates if name in keys), None)
    if chosen is None:
        raise KeyError(f"No audio feature key found in {sorted(keys)}")
    return chosen

def load_audio_npz(path):
    """Load the feature array and its validity mask from one ``.npz`` archive.

    The feature entry is selected with ``pick_feat_key``. The mask is taken from the
    ``attention_mask`` entry when the archive has one; otherwise every position is
    marked valid.

    Args:
        path: location of the ``.npz`` file.

    Returns:
        ``(feats, mask)`` as numpy arrays. ``feats`` is returned as stored
        (the caller expects [L, D] or [N, L, D]). For 2-D/3-D features a stored mask
        is guaranteed to have shape ``feats.shape[:-1]``.

    Raises:
        KeyError: the archive has no recognised feature key.
        ValueError: a stored mask cannot be aligned with the features, or a requested
            entry is an object array (pickles are not loaded).
    """
    # allow_pickle stays at numpy's safe default: only plain arrays are read here, and
    # unpickling can execute arbitrary code. mmap_mode is not passed because it has no
    # effect on zipped .npz archives.
    with np.load(path) as arr:
        fk = pick_feat_key(arr.files)
        feats = arr[fk]  # np.ndarray: [L, D] or [N, L, D]
        if "attention_mask" in arr.files:
            mask = arr["attention_mask"]
            expected = feats.shape[:-1]
            if feats.ndim in (2, 3) and mask.shape != expected:
                # Tolerate only a missing/extra singleton batch axis (e.g. [1, L] vs [L]);
                # anything else would silently mis-pool downstream, so fail loudly.
                mask_core = tuple(s for s in mask.shape if s != 1)
                feat_core = tuple(s for s in expected if s != 1)
                if mask_core != feat_core:
                    raise ValueError(
                        f"attention_mask shape {mask.shape} does not match feature shape {feats.shape}"
                    )
                mask = mask.reshape(expected)
        elif feats.ndim == 3:
            # no stored mask -> assume every frame of every sequence is valid
            mask = np.ones((feats.shape[0], feats.shape[1]), dtype=bool)
        else:
            mask = np.ones((feats.shape[0],), dtype=bool)
    return feats, mask  # numpy arrays

# ================= Main =================
def main():
    """Encode every feature archive under ``PD_ReadText_AUDIO_NPZ_DIR``.

    For each ``.npz`` matched by ``PATTERNS`` the function skips inputs whose output
    already exists, loads features and mask with ``load_audio_npz``, runs the
    self-attention encoder in chunks of ``BATCH_N`` sequences under
    ``torch.no_grad()``, and writes ``<stem>_h_audio_selfattn.npz`` into
    ``PD_ReadText_OUT_DIR``.

    Saved keys: ``h_audio`` ([N, L, d_model]; N=1 for a single sequence),
    ``attention_mask`` (as loaded), ``h_audio_pooled`` ([N, d_model], only when
    ``SAVE_POOLED`` is true), ``d_model``, ``n_heads``, ``n_layers`` and
    ``source_audio_npz``.
    """
    # No checkpoint is loaded below, so the encoder weights are whatever the random
    # initialiser produces. Seeding right before construction makes every run -- and
    # every cohort cell that uses the same hyper-parameters -- encode with identical
    # weights. NOTE: torch.manual_seed resets the global torch RNG.
    init_seed = 0

    # collect files ("*.npz" overlaps the specific patterns, hence the set)
    paths = sorted({
        hit
        for pat in PATTERNS
        for hit in glob.glob(os.path.join(PD_ReadText_AUDIO_NPZ_DIR, pat))
    })
    print(f"Found {len(paths)} NPZ files to process.")

    encoder = None
    d_in_cached = None
    d_model = None

    for p in paths:
        src_name = os.path.basename(p)
        base = os.path.splitext(src_name)[0]
        out_path = os.path.join(PD_ReadText_OUT_DIR, f"{base.replace('_hubert_feats','').replace('_audio_feats','')}_h_audio_selfattn.npz")
        if os.path.exists(out_path):
            print(f"[skip] {os.path.basename(out_path)} (exists)")
            continue

        try:
            feats_np, mask_np = load_audio_npz(p)  # feats: [L,D] or [N,L,D]; mask: [L] or [N,L]
        except Exception as e:
            print(f"[err]  {src_name}: {e}")
            continue

        # standardize to torch
        feats = torch.from_numpy(feats_np).float()
        mask = torch.from_numpy(mask_np).bool()

        if feats.dim() not in (2, 3):
            print(f"[err]  {src_name}: unexpected feature shape {feats.shape}")
            continue

        # Give a single sequence an explicit batch axis so one code path serves both layouts.
        single = feats.dim() == 2
        feats_b = feats.unsqueeze(0) if single else feats       # [N, L, d_in]
        mask_b = mask.unsqueeze(0) if single else mask           # [N, L]
        n_seq, d_in = feats_b.shape[0], feats_b.shape[-1]
        if n_seq == 0:
            print(f"[err]  {src_name}: no sequences in feature array {tuple(feats.shape)}")
            continue

        # build encoder once (or rebuild if d_in changes across files)
        if d_in_cached != d_in:
            d_in_cached = d_in
            d_model = d_in if D_MODEL is None else int(D_MODEL)
            torch.manual_seed(init_seed)
            encoder = SelfAttentionAudioEncoder(
                d_in=d_in, d_model=d_model, n_heads=N_HEADS, n_layers=N_LAYERS, d_ff=D_FF, dropout=DROPOUT
            ).to(DEVICE).eval()
            print(f"[info] initialized encoder: d_in={d_in}, d_model={d_model}, heads={N_HEADS}, layers={N_LAYERS}")

        # run encoder in chunks along N
        h_chunks, pool_chunks = [], []
        with torch.no_grad():
            for i in range(0, n_seq, BATCH_N):
                xb = feats_b[i:i+BATCH_N].to(DEVICE)                         # [b, L, d_in]
                mb = mask_b[i:i+BATCH_N].to(DEVICE)                          # [b, L]
                hb = encoder(xb, mb)                                         # [b, L, d_model]
                h_chunks.append(hb.cpu())
                if SAVE_POOLED:
                    pool_chunks.append(masked_mean(hb, mb).cpu())            # [b, d_model]

        h_np = torch.cat(h_chunks, 0).numpy().astype(SAVE_DTYPE)            # [N, L, d_model]
        m_np = mask.cpu().numpy()                                            # original layout: [L] or [N, L]

        payload = dict(
            h_audio=h_np,
            attention_mask=m_np,
            d_model=np.array(d_model),
            n_heads=np.array(N_HEADS),
            n_layers=np.array(N_LAYERS),
            source_audio_npz=src_name,
        )
        poolnp = None
        if SAVE_POOLED:
            poolnp = torch.cat(pool_chunks, 0).numpy().astype(SAVE_DTYPE)   # [N, d_model]
            # Only stored when it exists: saving None would create a pickled object array.
            payload["h_audio_pooled"] = poolnp

        # save -- write to a temporary name and rename, so an interrupted run cannot
        # leave a truncated file that the "exists -> skip" check would accept.
        tmp_path = out_path + ".tmp.npz"
        np.savez_compressed(tmp_path, **payload)
        os.replace(tmp_path, out_path)
        print(f"[ok]   {src_name} -> h_audio {h_np.shape}" + (f", pooled {poolnp.shape}" if SAVE_POOLED else ""))

# Execute the pipeline when this cell/script is run directly (skipped if imported as a module).
if __name__ == "__main__":
    main()


Found 16 NPZ files to process.
[info] initialized encoder: d_in=768, d_model=768, heads=8, layers=2
[ok]   ID02_pd_2_0_0_hubert_feats.npz -> h_audio (1, 7872, 768), pooled (1, 768)
[ok]   ID04_pd_2_0_1_hubert_feats.npz -> h_audio (1, 6119, 768), pooled (1, 768)
[ok]   ID06_pd_3_1_1_hubert_feats.npz -> h_audio (1, 8940, 768), pooled (1, 768)
[ok]   ID07_pd_2_0_0_hubert_feats.npz -> h_audio (1, 7379, 768), pooled (1, 768)
[ok]   ID13_pd_3_2_2_hubert_feats.npz -> h_audio (1, 4664, 768), pooled (1, 768)
[ok]   ID16_pd_2_0_0_hubert_feats.npz -> h_audio (1, 8357, 768), pooled (1, 768)
[ok]   ID17_pd_2_1_0_hubert_feats.npz -> h_audio (1, 5467, 768), pooled (1, 768)
[ok]   ID18_pd_4_3_3_hubert_feats.npz -> h_audio (1, 4284, 768), pooled (1, 768)
[ok]   ID20_pd_3_0_1_hubert_feats.npz -> h_audio (1, 7049, 768), pooled (1, 768)
[ok]   ID24_pd_2_0_0_hubert_feats.npz -> h_audio (1, 7092, 768), pooled (1, 768)
[ok]   ID27_pd_4_1_1_hubert_feats.npz -> h_audio (1, 4186, 768), pooled (1, 768)
[ok]   ID

In [14]:
import os, glob, numpy as np, torch
import torch.nn as nn

# ===================== Config =====================
# Input folder with the per-recording feature archives, and the sibling folder that
# receives the encoder outputs.
PD_Spontaneous_AUDIO_NPZ_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/hubert_audio/PD_Spontaneous_hubert_features"   # <-- change to your folder with *_hubert_feats.npz
PD_Spontaneous_OUT_DIR       = os.path.join(PD_Spontaneous_AUDIO_NPZ_DIR, "..", "PD_Spontaneous_h_audio_selfattn")
PATTERNS      = ["*_hubert_feats.npz", "*_audio_feats.npz", "*.npz"]  # will auto-pick the right keys

DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_N     = 64             # sequences per forward pass when input is [N, L, D]
D_MODEL     = None           # None -> use d_in; or set e.g. 512/768/1024 to project
N_HEADS     = 8
N_LAYERS    = 2
D_FF        = 2048
DROPOUT     = 0.1
SAVE_DTYPE  = np.float32     # use np.float16 to halve disk space
SAVE_POOLED = True

# Check the input folder BEFORE creating the output folder. OUT_DIR is spelled
# "<input>/../<name>", so os.makedirs() would otherwise create a missing (e.g.
# mistyped) input folder as an intermediate directory and this check could never fail.
assert os.path.isdir(PD_Spontaneous_AUDIO_NPZ_DIR), f"Missing input dir: {PD_Spontaneous_AUDIO_NPZ_DIR}"
os.makedirs(PD_Spontaneous_OUT_DIR, exist_ok=True)
assert os.path.isdir(PD_Spontaneous_OUT_DIR),       f"Missing output dir: {PD_Spontaneous_OUT_DIR}"

# ================= Model =================
class SelfAttentionAudioEncoder(nn.Module):
    """Pre-norm Transformer encoder over a sequence of audio frame embeddings.

    Pipeline: optional linear projection (``d_in`` -> ``d_model``, skipped when the two
    are equal), ``n_layers`` GELU ``TransformerEncoderLayer`` blocks (batch-first,
    norm-first), a final ``LayerNorm``, then padded positions are reset to 0.
    """

    def __init__(self, d_in=768, d_model=768, n_heads=8, n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        # A projection is only needed when input and model widths differ.
        if d_in == d_model:
            self.in_proj = nn.Identity()
        else:
            self.in_proj = nn.Linear(d_in, d_model)

        block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, audio_embeddings, attention_mask):
        """Encode the frames and zero the padded positions.

        Args:
            audio_embeddings: [B, L, d_in], or [L, d_in] for a single sequence.
            attention_mask: [B, L] or [L]; 1/True = valid frame, 0/False = padding.

        Returns:
            Tensor shaped [B, L, d_model]; padded positions are exactly 0.
        """
        # Promote unbatched inputs to a batch of one.
        feats = audio_embeddings[None] if audio_embeddings.dim() == 2 else audio_embeddings
        valid = attention_mask[None] if attention_mask.dim() == 1 else attention_mask

        pad = ~valid.bool()                                     # True = ignore
        hidden = self.in_proj(feats)                            # [B, L, d_model]
        hidden = self.encoder(hidden, src_key_padding_mask=pad)
        hidden = self.out_norm(hidden)
        return hidden.masked_fill(pad.unsqueeze(-1), 0.0)       # tidy pads

def masked_mean(x, mask):
    """Average ``x`` over dim 1, counting only positions where ``mask`` is set.

    Args:
        x: tensor shaped [B, L, D].
        mask: tensor shaped [B, L]; True/non-zero marks a valid position.

    Returns:
        Tensor shaped [B, D]. A row with no valid position yields zeros, because the
        divisor is clamped to 1e-6 instead of being allowed to reach 0.
    """
    weights = mask.unsqueeze(-1).float()              # [B, L, 1]
    total = (x * weights).sum(1)                      # [B, D]
    count = weights.sum(1).clamp(min=1e-6)            # [B, 1]
    return total / count

# ================= I/O helpers =================
def pick_feat_key(keys):
    """Return the first recognised audio-feature key contained in ``keys``.

    Candidates are tried in this priority order: "hubert_embeddings",
    "audio_embeddings", "features", "hidden_states".

    Raises:
        KeyError: none of the candidates is present in ``keys``.
    """
    candidates = ("hubert_embeddings", "audio_embeddings", "features", "hidden_states")
    chosen = next((name for name in candidates if name in keys), None)
    if chosen is None:
        raise KeyError(f"No audio feature key found in {sorted(keys)}")
    return chosen

def load_audio_npz(path):
    """Load the feature array and its validity mask from one ``.npz`` archive.

    The feature entry is selected with ``pick_feat_key``. The mask is taken from the
    ``attention_mask`` entry when the archive has one; otherwise every position is
    marked valid.

    Args:
        path: location of the ``.npz`` file.

    Returns:
        ``(feats, mask)`` as numpy arrays. ``feats`` is returned as stored
        (the caller expects [L, D] or [N, L, D]). For 2-D/3-D features a stored mask
        is guaranteed to have shape ``feats.shape[:-1]``.

    Raises:
        KeyError: the archive has no recognised feature key.
        ValueError: a stored mask cannot be aligned with the features, or a requested
            entry is an object array (pickles are not loaded).
    """
    # allow_pickle stays at numpy's safe default: only plain arrays are read here, and
    # unpickling can execute arbitrary code. mmap_mode is not passed because it has no
    # effect on zipped .npz archives.
    with np.load(path) as arr:
        fk = pick_feat_key(arr.files)
        feats = arr[fk]  # np.ndarray: [L, D] or [N, L, D]
        if "attention_mask" in arr.files:
            mask = arr["attention_mask"]
            expected = feats.shape[:-1]
            if feats.ndim in (2, 3) and mask.shape != expected:
                # Tolerate only a missing/extra singleton batch axis (e.g. [1, L] vs [L]);
                # anything else would silently mis-pool downstream, so fail loudly.
                mask_core = tuple(s for s in mask.shape if s != 1)
                feat_core = tuple(s for s in expected if s != 1)
                if mask_core != feat_core:
                    raise ValueError(
                        f"attention_mask shape {mask.shape} does not match feature shape {feats.shape}"
                    )
                mask = mask.reshape(expected)
        elif feats.ndim == 3:
            # no stored mask -> assume every frame of every sequence is valid
            mask = np.ones((feats.shape[0], feats.shape[1]), dtype=bool)
        else:
            mask = np.ones((feats.shape[0],), dtype=bool)
    return feats, mask  # numpy arrays

# ================= Main =================
def main():
    """Encode every feature archive under ``PD_Spontaneous_AUDIO_NPZ_DIR``.

    For each ``.npz`` matched by ``PATTERNS`` the function skips inputs whose output
    already exists, loads features and mask with ``load_audio_npz``, runs the
    self-attention encoder in chunks of ``BATCH_N`` sequences under
    ``torch.no_grad()``, and writes ``<stem>_h_audio_selfattn.npz`` into
    ``PD_Spontaneous_OUT_DIR``.

    Saved keys: ``h_audio`` ([N, L, d_model]; N=1 for a single sequence),
    ``attention_mask`` (as loaded), ``h_audio_pooled`` ([N, d_model], only when
    ``SAVE_POOLED`` is true), ``d_model``, ``n_heads``, ``n_layers`` and
    ``source_audio_npz``.
    """
    # No checkpoint is loaded below, so the encoder weights are whatever the random
    # initialiser produces. Seeding right before construction makes every run -- and
    # every cohort cell that uses the same hyper-parameters -- encode with identical
    # weights. NOTE: torch.manual_seed resets the global torch RNG.
    init_seed = 0

    # collect files ("*.npz" overlaps the specific patterns, hence the set)
    paths = sorted({
        hit
        for pat in PATTERNS
        for hit in glob.glob(os.path.join(PD_Spontaneous_AUDIO_NPZ_DIR, pat))
    })
    print(f"Found {len(paths)} NPZ files to process.")

    encoder = None
    d_in_cached = None
    d_model = None

    for p in paths:
        src_name = os.path.basename(p)
        base = os.path.splitext(src_name)[0]
        out_path = os.path.join(PD_Spontaneous_OUT_DIR, f"{base.replace('_hubert_feats','').replace('_audio_feats','')}_h_audio_selfattn.npz")
        if os.path.exists(out_path):
            print(f"[skip] {os.path.basename(out_path)} (exists)")
            continue

        try:
            feats_np, mask_np = load_audio_npz(p)  # feats: [L,D] or [N,L,D]; mask: [L] or [N,L]
        except Exception as e:
            print(f"[err]  {src_name}: {e}")
            continue

        # standardize to torch
        feats = torch.from_numpy(feats_np).float()
        mask = torch.from_numpy(mask_np).bool()

        if feats.dim() not in (2, 3):
            print(f"[err]  {src_name}: unexpected feature shape {feats.shape}")
            continue

        # Give a single sequence an explicit batch axis so one code path serves both layouts.
        single = feats.dim() == 2
        feats_b = feats.unsqueeze(0) if single else feats       # [N, L, d_in]
        mask_b = mask.unsqueeze(0) if single else mask           # [N, L]
        n_seq, d_in = feats_b.shape[0], feats_b.shape[-1]
        if n_seq == 0:
            print(f"[err]  {src_name}: no sequences in feature array {tuple(feats.shape)}")
            continue

        # build encoder once (or rebuild if d_in changes across files)
        if d_in_cached != d_in:
            d_in_cached = d_in
            d_model = d_in if D_MODEL is None else int(D_MODEL)
            torch.manual_seed(init_seed)
            encoder = SelfAttentionAudioEncoder(
                d_in=d_in, d_model=d_model, n_heads=N_HEADS, n_layers=N_LAYERS, d_ff=D_FF, dropout=DROPOUT
            ).to(DEVICE).eval()
            print(f"[info] initialized encoder: d_in={d_in}, d_model={d_model}, heads={N_HEADS}, layers={N_LAYERS}")

        # run encoder in chunks along N
        h_chunks, pool_chunks = [], []
        with torch.no_grad():
            for i in range(0, n_seq, BATCH_N):
                xb = feats_b[i:i+BATCH_N].to(DEVICE)                         # [b, L, d_in]
                mb = mask_b[i:i+BATCH_N].to(DEVICE)                          # [b, L]
                hb = encoder(xb, mb)                                         # [b, L, d_model]
                h_chunks.append(hb.cpu())
                if SAVE_POOLED:
                    pool_chunks.append(masked_mean(hb, mb).cpu())            # [b, d_model]

        h_np = torch.cat(h_chunks, 0).numpy().astype(SAVE_DTYPE)            # [N, L, d_model]
        m_np = mask.cpu().numpy()                                            # original layout: [L] or [N, L]

        payload = dict(
            h_audio=h_np,
            attention_mask=m_np,
            d_model=np.array(d_model),
            n_heads=np.array(N_HEADS),
            n_layers=np.array(N_LAYERS),
            source_audio_npz=src_name,
        )
        poolnp = None
        if SAVE_POOLED:
            poolnp = torch.cat(pool_chunks, 0).numpy().astype(SAVE_DTYPE)   # [N, d_model]
            # Only stored when it exists: saving None would create a pickled object array.
            payload["h_audio_pooled"] = poolnp

        # save -- write to a temporary name and rename, so an interrupted run cannot
        # leave a truncated file that the "exists -> skip" check would accept.
        tmp_path = out_path + ".tmp.npz"
        np.savez_compressed(tmp_path, **payload)
        os.replace(tmp_path, out_path)
        print(f"[ok]   {src_name} -> h_audio {h_np.shape}" + (f", pooled {poolnp.shape}" if SAVE_POOLED else ""))

# Execute the pipeline when this cell/script is run directly (skipped if imported as a module).
if __name__ == "__main__":
    main()


Found 15 NPZ files to process.
[info] initialized encoder: d_in=768, d_model=768, heads=8, layers=2
[ok]   ID02_pd_2_0_0_hubert_feats.npz -> h_audio (1, 9462, 768), pooled (1, 768)
[ok]   ID04_pd_2_0_1_hubert_feats.npz -> h_audio (1, 7340, 768), pooled (1, 768)
[ok]   ID06_pd_3_1_1_hubert_feats.npz -> h_audio (1, 6411, 768), pooled (1, 768)
[ok]   ID07_pd_2_0_0_hubert_feats.npz -> h_audio (1, 10457, 768), pooled (1, 768)
[ok]   ID13_pd_3_2_2_hubert_feats.npz -> h_audio (1, 9966, 768), pooled (1, 768)
[ok]   ID16_pd_2_0_0_hubert_feats.npz -> h_audio (1, 7824, 768), pooled (1, 768)
[ok]   ID17_pd_2_1_0_hubert_feats.npz -> h_audio (1, 7712, 768), pooled (1, 768)
[ok]   ID20_pd_3_0_1_hubert_feats.npz -> h_audio (1, 5511, 768), pooled (1, 768)
[ok]   ID24_pd_2_0_0_hubert_feats.npz -> h_audio (1, 6974, 768), pooled (1, 768)
[ok]   ID27_pd_4_1_1_hubert_feats.npz -> h_audio (1, 8609, 768), pooled (1, 768)
[ok]   ID29_pd_3_1_2_hubert_feats.npz -> h_audio (1, 7033, 768), pooled (1, 768)
[ok]   I

# Self Attention to get the h_graph


In [15]:
import math, torch, torch.nn as nn

class GraphlessGraphEncoder(nn.Module):
    """Transformer encoder over spectrogram tokens that also returns a mean-pooled vector.

    Width / head selection:
      * Neither ``d_model`` nor ``n_heads`` given: keep ``d_model = d_in`` and take the
        first entry of ``prefer_heads`` that divides ``d_in``; if none divides it, use
        ``prefer_heads[0]`` heads and round ``d_model`` up to the next multiple of it.
      * Otherwise: ``d_model`` defaults to ``d_in`` and ``n_heads`` to 8, and a
        ``ValueError`` is raised when ``d_model`` is not divisible by ``n_heads``.

    The chosen values are exposed as ``self.d_in``, ``self.d_model``, ``self.n_heads``.
    """

    def __init__(self, d_in, prefer_heads=(8, 6, 4, 3, 2, 1), d_model=None, n_heads=None,
                 n_layers=2, d_ff=2048, dropout=0.1):
        super().__init__()
        if d_model is None and n_heads is None:
            # Automatic mode: first preferred head count that divides d_in, if any.
            divisor = next((h for h in prefer_heads if d_in % h == 0), None)
            if divisor is not None:
                d_model, n_heads = d_in, divisor
            else:
                # Nothing divides d_in -> project up to the next multiple of the top preference.
                n_heads = prefer_heads[0]
                d_model = int(math.ceil(d_in / n_heads) * n_heads)
        else:
            d_model = int(d_model) if d_model is not None else d_in
            n_heads = int(n_heads) if n_heads is not None else 8
            if d_model % n_heads != 0:
                raise ValueError(f"d_model={d_model} must be divisible by n_heads={n_heads}")

        self.d_in = d_in
        self.d_model = d_model
        self.n_heads = n_heads

        if d_in == d_model:
            self.in_proj = nn.Identity()
        else:
            self.in_proj = nn.Linear(d_in, d_model)
        block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)
        self.out_norm = nn.LayerNorm(d_model)

    def forward(self, spec_tokens, mask=None):
        """Encode the tokens and pool them.

        Args:
            spec_tokens: [B, L, d_in], or [L, d_in] for a single sequence.
            mask: optional [B, L] or [L] validity mask (True/non-zero = valid); moved to
                the tokens' device. ``None`` means every token is valid.

        Returns:
            ``(x, h_graph)``: per-token outputs [B, L, d_model] with masked positions set
            to 0, and their masked mean over L, shaped [B, d_model].
        """
        tokens = spec_tokens.unsqueeze(0) if spec_tokens.dim() == 2 else spec_tokens  # -> [B, L, D]
        n_batch, n_tok, _ = tokens.shape

        if mask is None:
            valid = torch.ones((n_batch, n_tok), dtype=torch.bool, device=tokens.device)
        else:
            valid = mask.unsqueeze(0) if mask.dim() == 1 else mask
            valid = valid.to(tokens.device)

        pad = ~valid.bool()                                   # True = ignore
        hidden = self.in_proj(tokens)
        hidden = self.encoder(hidden, src_key_padding_mask=pad)
        hidden = self.out_norm(hidden)
        hidden = hidden.masked_fill(pad.unsqueeze(-1), 0.0)

        # Masked mean over the token axis; the clamp avoids dividing by zero.
        weights = valid.unsqueeze(-1).float()
        pooled = (hidden * weights).sum(1) / weights.sum(1).clamp(min=1e-6)
        return hidden, pooled


In [19]:
import os, glob, numpy as np, torch

# Folder with the CLIP spectrogram embeddings; every *.npy in it is processed.
HC_ReadText_Spectrogram_CLIP_NPY_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/CLIP_Spectrogram_embeddings"
HC_ReadText_Spectrogram_OUT_DIR      = os.path.join(HC_ReadText_Spectrogram_CLIP_NPY_DIR, "..", "h_graph_selfattn")

# Check the input folder BEFORE creating the output folder: OUT_DIR is spelled
# "<input>/../<name>", so os.makedirs() would otherwise create a missing (e.g. mistyped)
# input folder as an intermediate directory and the loop would silently find no files.
assert os.path.isdir(HC_ReadText_Spectrogram_CLIP_NPY_DIR), f"Missing input dir: {HC_ReadText_Spectrogram_CLIP_NPY_DIR}"
os.makedirs(HC_ReadText_Spectrogram_OUT_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = None
cached_d_in = None
# No checkpoint is loaded, so the encoder is randomly initialised. A fixed seed makes
# the saved h_graph vectors reproducible across runs. NOTE: resets the global torch RNG.
init_seed = 0

for p in sorted(glob.glob(os.path.join(HC_ReadText_Spectrogram_CLIP_NPY_DIR, "*.npy"))):
    base = os.path.splitext(os.path.basename(p))[0]
    out_path = os.path.join(HC_ReadText_Spectrogram_OUT_DIR, f"{base}_h_graph_selfattn.npz")
    if os.path.exists(out_path):
        print("[skip]", os.path.basename(out_path)); continue

    feats = np.load(p)
    # Promote to [B, L, D]: a 1-D vector becomes [1, 1, D]; a 2-D array becomes ONE
    # sequence of its rows ([1, L, D]).
    # NOTE(review): if the rows of a 2-D file are independent per-recording embeddings,
    # this attends across recordings and pools the whole file into a single h_graph
    # vector -- confirm that is intended.
    if feats.ndim == 1: feats = feats[None, :]
    if feats.ndim == 2: feats = feats[None, :, :]
    mask = np.ones(feats.shape[:-1], dtype=bool)       # no padding information -> all valid

    x = torch.from_numpy(feats).float().to(device)    # [B, L, D_in]
    m = torch.from_numpy(mask).bool().to(device)
    d_in = x.shape[-1]
    if d_in != cached_d_in:
        # (Re)build the encoder whenever the embedding width changes between files.
        cached_d_in = d_in
        torch.manual_seed(init_seed)
        encoder = GraphlessGraphEncoder(d_in=d_in, prefer_heads=(8,6,4,3,2,1), n_layers=2).to(device).eval()
        print(f"[init] encoder: d_in={d_in}, d_model={encoder.d_model}, heads={encoder.n_heads}")

    with torch.no_grad():
        h_nodes, h_vec = encoder(x, m)                # [B, L, D], [B, D]

    # Write to a temporary name and rename, so an interrupted run cannot leave a
    # truncated file that the "exists -> skip" check above would accept.
    tmp_path = out_path + ".tmp.npz"
    np.savez_compressed(
        tmp_path,
        h_graph_nodes=h_nodes.cpu().numpy().astype(np.float32),
        attention_mask=mask.astype(bool),
        h_graph=h_vec.cpu().numpy().astype(np.float32),
        d_model=np.array(encoder.d_model),
        source_clip_npy=os.path.basename(p),
    )
    os.replace(tmp_path, out_path)
    print("[ok]", os.path.basename(p), "->", h_vec.shape)


[init] encoder: d_in=512, d_model=512, heads=8
[ok] HC_ReadText_Spectrogram_CLIP_features.npy -> torch.Size([1, 512])
[ok] HC_Spontaneous_Spectrogram_CLIP_features.npy -> torch.Size([1, 512])
[ok] PD_ReadText_Spectrogram_CLIP_features.npy -> torch.Size([1, 512])
[ok] PD_Spontaneous_Spectrogram_CLIP_features.npy -> torch.Size([1, 512])


