In [1]:
import torch
import numpy as np
import librosa
import PIL.Image as Image
import matplotlib.pyplot as plt
import clip
import os

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"
# clip.load hands back the model together with the image transform that is
# later applied to every PIL image before encoding.
model, preprocess = clip.load("ViT-B/32", device=device)

In [14]:
# --- Input / output locations -------------------------------------------------
# Folder that is scanned for this group's spectrogram arrays.
folder_path = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Spectrogram/HC_ReadText_Spectrogram"
# NOTE(review): out_dir is created below, but no code visible in this notebook
# writes into it -- the result files are saved relative to the working directory.
out_dir = os.path.join(
    "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi",
    "HC_ReadText_Spectrogram_embeddings",
)
# Result files: the embedding matrix and the matching list of source filenames.
save_features = "HC_ReadText_Spectrogram_CLIP_features.npy"
save_filenames = "HC_ReadText_Spectrogram_CLIP_filenames.txt"

# --- Model / preprocessing settings -------------------------------------------
# NOTE(review): model_name is not read by any code visible here; the model is
# loaded from a literal "ViT-B/32" in an earlier cell.
model_name = "ViT-B/32"
# Edge length (pixels) of the square image built from each spectrogram.
Target_size = 224
# Number of files encoded per forward pass.
Batch_size = 64
# NOTE(review): Flip_low_freq_bottom is not read by any code visible here, so
# no vertical flip is applied to the spectrograms -- confirm whether one is intended.
Flip_low_freq_bottom = True

os.makedirs(out_dir, exist_ok=True)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [15]:
import glob

def spectrogram_to_pil_rgb(spec: np.ndarray, target_size=None) -> Image.Image:
    """Convert one spectrogram array into a square RGB PIL image.

    The array is reduced to 2-D, min-max scaled per file to the 0..255 range
    (uint8), resized to ``target_size`` x ``target_size`` with bicubic
    resampling and finally converted to RGB.

    Parameters
    ----------
    spec : np.ndarray
        Spectrogram values. Must be 2-D, or reducible to 2-D by dropping
        length-1 axes.
    target_size : int, optional
        Edge length of the output image in pixels. When omitted, the
        notebook-level ``Target_size`` setting is used.

    Returns
    -------
    PIL.Image.Image
        RGB image of size ``(target_size, target_size)``.

    Raises
    ------
    AssertionError
        If the array is still not 2-D after squeezing.
    """
    if target_size is None:
        target_size = Target_size

    # Reduce to a plain 2-D array.
    if spec.ndim == 3 and spec.shape[0] == 1:
        spec = spec[0]
    elif spec.ndim != 2:
        spec = np.squeeze(spec)
        assert spec.ndim == 2, f"Expected 2D array, got shape {spec.shape}"

    # Min-max normalize to [0, 255] per file.
    s_min, s_max = float(np.min(spec)), float(np.max(spec))
    if s_max - s_min < 1e-12:
        # Constant spectrogram: avoid dividing by zero and emit an all-black image.
        img = np.zeros_like(spec, dtype=np.uint8)
    else:
        img = (255.0 * (spec - s_min) / (s_max - s_min)).astype(np.uint8)

    # img is always a 2-D uint8 array at this point, which Pillow maps to
    # grayscale mode "L" by itself. Passing mode= explicitly is deprecated in
    # recent Pillow releases and emits a warning on every call.
    pil = Image.fromarray(img)
    if pil.size != (target_size, target_size):
        pil = pil.resize((target_size, target_size), resample=Image.BICUBIC)

    # Expand the single grayscale channel to a 3-channel RGB image.
    return pil.convert("RGB")

def chunked(iterable, n):
    """Lazily split a sliceable sequence into consecutive pieces of length ``n``.

    Pieces come out in order; the final one is shorter when ``len(iterable)``
    is not a multiple of ``n``. The argument must support ``len()`` and slicing.
    """
    total = len(iterable)
    yield from (iterable[start:start + n] for start in range(0, total, n))


# Collect the spectrogram files in sorted order so that row i of the saved
# feature matrix corresponds to line i of the saved filename list.
files = sorted(glob.glob(os.path.join(folder_path, "*.npy")))
if not files:
    raise FileNotFoundError(f"No .npy files found under: {folder_path}")

all_features = []
all_filenames = []

with torch.no_grad():
    for batch in chunked(files, Batch_size):
        # Load every spectrogram of this batch and turn it into an RGB image.
        pil_imgs = []
        batch_names = []
        for f in batch:
            spec = np.load(f)
            pil_imgs.append(spectrogram_to_pil_rgb(spec))
            batch_names.append(os.path.basename(f))

        # Preprocess to one stacked tensor on the model's device.
        image_tensors = torch.stack([preprocess(img) for img in pil_imgs], dim=0).to(device)

        # Encoding has to stay inside the batch loop: every batch must
        # contribute its rows, otherwise the features and the filename list
        # go out of step as soon as there is more than one batch.
        features = model.encode_image(image_tensors)

        # Scale each embedding to unit L2 norm.
        features = features / features.norm(dim=-1, keepdim=True)

        all_features.append(features.cpu())
        # Record the names only once their embeddings exist.
        all_filenames.extend(batch_names)

all_features = torch.cat(all_features, dim=0).numpy()

# Guard the row <-> filename alignment before anything is written to disk.
assert all_features.shape[0] == len(all_filenames), (
    f"Embedding/filename count mismatch: {all_features.shape[0]} vs {len(all_filenames)}"
)
np.save(save_features, all_features)

# Explicit encoding so non-ASCII filenames are written the same on every platform.
with open(save_filenames, 'w', encoding="utf-8") as f:
    f.write("\n".join(all_filenames))

print(f"Done. Saved {all_features.shape[0]} embeddings of dim {all_features.shape[1]}.")
print(f"- Features: {save_features}")
print(f"- Filenames: {save_filenames}")

  pil = Image.fromarray(img, mode="L")  # grayscale


Done. Saved 21 embeddings of dim 512.
- Features: HC_ReadText_Spectrogram_CLIP_features.npy
- Filenames: HC_ReadText_Spectrogram_CLIP_filenames.txt


In [16]:
# Result files for the PD read-text group: embedding matrix and filename list.
save_filenames = "PD_ReadText_Spectrogram_CLIP_filenames.txt"
save_features = "PD_ReadText_Spectrogram_CLIP_features.npy"
# Folder holding the spectrogram arrays of the PD read-text group.
folder_path = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Spectrogram/PD_ReadText_Spectrogram"

In [17]:
import glob

def spectrogram_to_pil_rgb(spec: np.ndarray, target_size=None) -> Image.Image:
    """Convert one spectrogram array into a square RGB PIL image.

    The array is reduced to 2-D, min-max scaled per file to the 0..255 range
    (uint8), resized to ``target_size`` x ``target_size`` with bicubic
    resampling and finally converted to RGB.

    Parameters
    ----------
    spec : np.ndarray
        Spectrogram values. Must be 2-D, or reducible to 2-D by dropping
        length-1 axes.
    target_size : int, optional
        Edge length of the output image in pixels. When omitted, the
        notebook-level ``Target_size`` setting is used.

    Returns
    -------
    PIL.Image.Image
        RGB image of size ``(target_size, target_size)``.

    Raises
    ------
    AssertionError
        If the array is still not 2-D after squeezing.
    """
    if target_size is None:
        target_size = Target_size

    # Reduce to a plain 2-D array.
    if spec.ndim == 3 and spec.shape[0] == 1:
        spec = spec[0]
    elif spec.ndim != 2:
        spec = np.squeeze(spec)
        assert spec.ndim == 2, f"Expected 2D array, got shape {spec.shape}"

    # Min-max normalize to [0, 255] per file.
    s_min, s_max = float(np.min(spec)), float(np.max(spec))
    if s_max - s_min < 1e-12:
        # Constant spectrogram: avoid dividing by zero and emit an all-black image.
        img = np.zeros_like(spec, dtype=np.uint8)
    else:
        img = (255.0 * (spec - s_min) / (s_max - s_min)).astype(np.uint8)

    # img is always a 2-D uint8 array at this point, which Pillow maps to
    # grayscale mode "L" by itself. Passing mode= explicitly is deprecated in
    # recent Pillow releases and emits a warning on every call.
    pil = Image.fromarray(img)
    if pil.size != (target_size, target_size):
        pil = pil.resize((target_size, target_size), resample=Image.BICUBIC)

    # Expand the single grayscale channel to a 3-channel RGB image.
    return pil.convert("RGB")

def chunked(iterable, n):
    """Lazily split a sliceable sequence into consecutive pieces of length ``n``.

    Pieces come out in order; the final one is shorter when ``len(iterable)``
    is not a multiple of ``n``. The argument must support ``len()`` and slicing.
    """
    total = len(iterable)
    yield from (iterable[start:start + n] for start in range(0, total, n))


# Collect the spectrogram files in sorted order so that row i of the saved
# feature matrix corresponds to line i of the saved filename list.
files = sorted(glob.glob(os.path.join(folder_path, "*.npy")))
if not files:
    raise FileNotFoundError(f"No .npy files found under: {folder_path}")

all_features = []
all_filenames = []

with torch.no_grad():
    for batch in chunked(files, Batch_size):
        # Load every spectrogram of this batch and turn it into an RGB image.
        pil_imgs = []
        batch_names = []
        for f in batch:
            spec = np.load(f)
            pil_imgs.append(spectrogram_to_pil_rgb(spec))
            batch_names.append(os.path.basename(f))

        # Preprocess to one stacked tensor on the model's device.
        image_tensors = torch.stack([preprocess(img) for img in pil_imgs], dim=0).to(device)

        # Encoding has to stay inside the batch loop: every batch must
        # contribute its rows, otherwise the features and the filename list
        # go out of step as soon as there is more than one batch.
        features = model.encode_image(image_tensors)

        # Scale each embedding to unit L2 norm.
        features = features / features.norm(dim=-1, keepdim=True)

        all_features.append(features.cpu())
        # Record the names only once their embeddings exist.
        all_filenames.extend(batch_names)

all_features = torch.cat(all_features, dim=0).numpy()

# Guard the row <-> filename alignment before anything is written to disk.
assert all_features.shape[0] == len(all_filenames), (
    f"Embedding/filename count mismatch: {all_features.shape[0]} vs {len(all_filenames)}"
)
np.save(save_features, all_features)

# Explicit encoding so non-ASCII filenames are written the same on every platform.
with open(save_filenames, 'w', encoding="utf-8") as f:
    f.write("\n".join(all_filenames))

print(f"Done. Saved {all_features.shape[0]} embeddings of dim {all_features.shape[1]}.")
print(f"- Features: {save_features}")
print(f"- Filenames: {save_filenames}")

  pil = Image.fromarray(img, mode="L")  # grayscale


Done. Saved 16 embeddings of dim 512.
- Features: PD_ReadText_Spectrogram_CLIP_features.npy
- Filenames: PD_ReadText_Spectrogram_CLIP_filenames.txt


In [18]:
# Result files for the HC spontaneous-speech group: embedding matrix and filename list.
save_filenames = "HC_Spontaneous_Spectrogram_CLIP_filenames.txt"
save_features = "HC_Spontaneous_Spectrogram_CLIP_features.npy"
# Folder holding the spectrogram arrays of the HC spontaneous-speech group.
folder_path = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Spectrogram/HC_Spontaneous_Spectrogram"

In [19]:
import glob

def spectrogram_to_pil_rgb(spec: np.ndarray, target_size=None) -> Image.Image:
    """Convert one spectrogram array into a square RGB PIL image.

    The array is reduced to 2-D, min-max scaled per file to the 0..255 range
    (uint8), resized to ``target_size`` x ``target_size`` with bicubic
    resampling and finally converted to RGB.

    Parameters
    ----------
    spec : np.ndarray
        Spectrogram values. Must be 2-D, or reducible to 2-D by dropping
        length-1 axes.
    target_size : int, optional
        Edge length of the output image in pixels. When omitted, the
        notebook-level ``Target_size`` setting is used.

    Returns
    -------
    PIL.Image.Image
        RGB image of size ``(target_size, target_size)``.

    Raises
    ------
    AssertionError
        If the array is still not 2-D after squeezing.
    """
    if target_size is None:
        target_size = Target_size

    # Reduce to a plain 2-D array.
    if spec.ndim == 3 and spec.shape[0] == 1:
        spec = spec[0]
    elif spec.ndim != 2:
        spec = np.squeeze(spec)
        assert spec.ndim == 2, f"Expected 2D array, got shape {spec.shape}"

    # Min-max normalize to [0, 255] per file.
    s_min, s_max = float(np.min(spec)), float(np.max(spec))
    if s_max - s_min < 1e-12:
        # Constant spectrogram: avoid dividing by zero and emit an all-black image.
        img = np.zeros_like(spec, dtype=np.uint8)
    else:
        img = (255.0 * (spec - s_min) / (s_max - s_min)).astype(np.uint8)

    # img is always a 2-D uint8 array at this point, which Pillow maps to
    # grayscale mode "L" by itself. Passing mode= explicitly is deprecated in
    # recent Pillow releases and emits a warning on every call.
    pil = Image.fromarray(img)
    if pil.size != (target_size, target_size):
        pil = pil.resize((target_size, target_size), resample=Image.BICUBIC)

    # Expand the single grayscale channel to a 3-channel RGB image.
    return pil.convert("RGB")

def chunked(iterable, n):
    """Lazily split a sliceable sequence into consecutive pieces of length ``n``.

    Pieces come out in order; the final one is shorter when ``len(iterable)``
    is not a multiple of ``n``. The argument must support ``len()`` and slicing.
    """
    total = len(iterable)
    yield from (iterable[start:start + n] for start in range(0, total, n))


# Collect the spectrogram files in sorted order so that row i of the saved
# feature matrix corresponds to line i of the saved filename list.
files = sorted(glob.glob(os.path.join(folder_path, "*.npy")))
if not files:
    raise FileNotFoundError(f"No .npy files found under: {folder_path}")

all_features = []
all_filenames = []

with torch.no_grad():
    for batch in chunked(files, Batch_size):
        # Load every spectrogram of this batch and turn it into an RGB image.
        pil_imgs = []
        batch_names = []
        for f in batch:
            spec = np.load(f)
            pil_imgs.append(spectrogram_to_pil_rgb(spec))
            batch_names.append(os.path.basename(f))

        # Preprocess to one stacked tensor on the model's device.
        image_tensors = torch.stack([preprocess(img) for img in pil_imgs], dim=0).to(device)

        # Encoding has to stay inside the batch loop: every batch must
        # contribute its rows, otherwise the features and the filename list
        # go out of step as soon as there is more than one batch.
        features = model.encode_image(image_tensors)

        # Scale each embedding to unit L2 norm.
        features = features / features.norm(dim=-1, keepdim=True)

        all_features.append(features.cpu())
        # Record the names only once their embeddings exist.
        all_filenames.extend(batch_names)

all_features = torch.cat(all_features, dim=0).numpy()

# Guard the row <-> filename alignment before anything is written to disk.
assert all_features.shape[0] == len(all_filenames), (
    f"Embedding/filename count mismatch: {all_features.shape[0]} vs {len(all_filenames)}"
)
np.save(save_features, all_features)

# Explicit encoding so non-ASCII filenames are written the same on every platform.
with open(save_filenames, 'w', encoding="utf-8") as f:
    f.write("\n".join(all_filenames))

print(f"Done. Saved {all_features.shape[0]} embeddings of dim {all_features.shape[1]}.")
print(f"- Features: {save_features}")
print(f"- Filenames: {save_filenames}")

  pil = Image.fromarray(img, mode="L")  # grayscale


Done. Saved 21 embeddings of dim 512.
- Features: HC_Spontaneous_Spectrogram_CLIP_features.npy
- Filenames: HC_Spontaneous_Spectrogram_CLIP_filenames.txt


In [20]:
# Result files for the PD spontaneous-speech group: embedding matrix and filename list.
save_filenames = "PD_Spontaneous_Spectrogram_CLIP_filenames.txt"
save_features = "PD_Spontaneous_Spectrogram_CLIP_features.npy"
# Folder holding the spectrogram arrays of the PD spontaneous-speech group.
folder_path = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Spectrogram/PD_Spontaneous_Spectrogram"

In [21]:
import glob

def spectrogram_to_pil_rgb(spec: np.ndarray, target_size=None) -> Image.Image:
    """Convert one spectrogram array into a square RGB PIL image.

    The array is reduced to 2-D, min-max scaled per file to the 0..255 range
    (uint8), resized to ``target_size`` x ``target_size`` with bicubic
    resampling and finally converted to RGB.

    Parameters
    ----------
    spec : np.ndarray
        Spectrogram values. Must be 2-D, or reducible to 2-D by dropping
        length-1 axes.
    target_size : int, optional
        Edge length of the output image in pixels. When omitted, the
        notebook-level ``Target_size`` setting is used.

    Returns
    -------
    PIL.Image.Image
        RGB image of size ``(target_size, target_size)``.

    Raises
    ------
    AssertionError
        If the array is still not 2-D after squeezing.
    """
    if target_size is None:
        target_size = Target_size

    # Reduce to a plain 2-D array.
    if spec.ndim == 3 and spec.shape[0] == 1:
        spec = spec[0]
    elif spec.ndim != 2:
        spec = np.squeeze(spec)
        assert spec.ndim == 2, f"Expected 2D array, got shape {spec.shape}"

    # Min-max normalize to [0, 255] per file.
    s_min, s_max = float(np.min(spec)), float(np.max(spec))
    if s_max - s_min < 1e-12:
        # Constant spectrogram: avoid dividing by zero and emit an all-black image.
        img = np.zeros_like(spec, dtype=np.uint8)
    else:
        img = (255.0 * (spec - s_min) / (s_max - s_min)).astype(np.uint8)

    # img is always a 2-D uint8 array at this point, which Pillow maps to
    # grayscale mode "L" by itself. Passing mode= explicitly is deprecated in
    # recent Pillow releases and emits a warning on every call.
    pil = Image.fromarray(img)
    if pil.size != (target_size, target_size):
        pil = pil.resize((target_size, target_size), resample=Image.BICUBIC)

    # Expand the single grayscale channel to a 3-channel RGB image.
    return pil.convert("RGB")

def chunked(iterable, n):
    """Lazily split a sliceable sequence into consecutive pieces of length ``n``.

    Pieces come out in order; the final one is shorter when ``len(iterable)``
    is not a multiple of ``n``. The argument must support ``len()`` and slicing.
    """
    total = len(iterable)
    yield from (iterable[start:start + n] for start in range(0, total, n))


# Collect the spectrogram files in sorted order so that row i of the saved
# feature matrix corresponds to line i of the saved filename list.
files = sorted(glob.glob(os.path.join(folder_path, "*.npy")))
if not files:
    raise FileNotFoundError(f"No .npy files found under: {folder_path}")

all_features = []
all_filenames = []

with torch.no_grad():
    for batch in chunked(files, Batch_size):
        # Load every spectrogram of this batch and turn it into an RGB image.
        pil_imgs = []
        batch_names = []
        for f in batch:
            spec = np.load(f)
            pil_imgs.append(spectrogram_to_pil_rgb(spec))
            batch_names.append(os.path.basename(f))

        # Preprocess to one stacked tensor on the model's device.
        image_tensors = torch.stack([preprocess(img) for img in pil_imgs], dim=0).to(device)

        # Encoding has to stay inside the batch loop: every batch must
        # contribute its rows, otherwise the features and the filename list
        # go out of step as soon as there is more than one batch.
        features = model.encode_image(image_tensors)

        # Scale each embedding to unit L2 norm.
        features = features / features.norm(dim=-1, keepdim=True)

        all_features.append(features.cpu())
        # Record the names only once their embeddings exist.
        all_filenames.extend(batch_names)

all_features = torch.cat(all_features, dim=0).numpy()

# Guard the row <-> filename alignment before anything is written to disk.
assert all_features.shape[0] == len(all_filenames), (
    f"Embedding/filename count mismatch: {all_features.shape[0]} vs {len(all_filenames)}"
)
np.save(save_features, all_features)

# Explicit encoding so non-ASCII filenames are written the same on every platform.
with open(save_filenames, 'w', encoding="utf-8") as f:
    f.write("\n".join(all_filenames))

print(f"Done. Saved {all_features.shape[0]} embeddings of dim {all_features.shape[1]}.")
print(f"- Features: {save_features}")
print(f"- Filenames: {save_filenames}")

  pil = Image.fromarray(img, mode="L")  # grayscale


Done. Saved 15 embeddings of dim 512.
- Features: PD_Spontaneous_Spectrogram_CLIP_features.npy
- Filenames: PD_Spontaneous_Spectrogram_CLIP_filenames.txt
