In [3]:
import os, glob
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import re

In [4]:
# --- Config ---
# Checkpoint and batching parameters.
MODEL_NAME = "bert-base-uncased"
MAX_LENGTH = 512            # fixed length so arrays save cleanly
BATCH_SIZE = 16
SAVE_DTYPE = np.float32     # switch to np.float16 to save disk

# Input transcripts and output folder for this cell (HC, read text).
# NOTE(review): unlike the output folders used by the other cells, this folder
# name carries no cohort/task prefix -- confirm that is intended.
TEXT_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Transcript/HC_ReadText_transcriptions"
OUT_DIR = os.path.join("/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi", "berts_feats_tokens_only")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

os.makedirs(OUT_DIR, exist_ok=True)

# --- Load frozen BERT ---
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
bert = AutoModel.from_pretrained(MODEL_NAME)
bert = bert.to(DEVICE)
bert = bert.eval()          # inference mode
# Freeze every weight: the model is only used as a feature extractor here.
for p in bert.parameters():
    p.requires_grad = False

def split_sentences(text: str):
    """Split ``text`` into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    The input is coerced with ``str()`` and stripped before splitting. Every
    piece is stripped, and empty or whitespace-only pieces are discarded.

    Returns:
        A non-empty list of strings. When no piece survives (for example the
        input is empty or only whitespace) the list holds the stripped input
        as its single element.
    """
    cleaned = str(text).strip()
    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+', cleaned):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    if not sentences:
        # Fallback keeps the result non-empty.
        return [cleaned]
    return sentences

@torch.no_grad()
def encode_batch(sentences):
    """Tokenize a batch of sentences and run them through the frozen BERT model.

    Each sentence is padded or truncated to exactly ``MAX_LENGTH`` tokens, so
    anything beyond that length is dropped without notice.

    Args:
        sentences: list of strings to encode together.

    Returns:
        ``(last_hidden, attention_mask)``, both moved to the CPU:
        ``last_hidden`` is ``[B, MAX_LENGTH, 768]`` and ``attention_mask`` is
        ``[B, MAX_LENGTH]``.
    """
    batch = tok(
        sentences,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",   # -> [B, MAX_LENGTH]
        return_tensors="pt",
    )
    # Move every tokenizer output onto the device the model lives on.
    inputs = {}
    for name, tensor in batch.items():
        inputs[name] = tensor.to(DEVICE)
    hidden_states = bert(**inputs).last_hidden_state   # [B, MAX_LENGTH, 768]
    attention_mask = inputs["attention_mask"]          # [B, MAX_LENGTH]
    return hidden_states.cpu(), attention_mask.cpu()

# Collect every transcript in TEXT_DIR; sorting keeps the processing order stable.
txt_pattern = os.path.join(TEXT_DIR, "*.txt")
files = sorted(glob.glob(txt_pattern))
print(f"Found {len(files)} text files.")

# One .npz per transcript: per-token embeddings plus the matching attention mask.
for path in tqdm(files, desc="BERT token embeddings"):
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    sents = split_sentences(text)

    # Encode BATCH_SIZE sentences at a time and keep each batch's outputs.
    token_blocks = []
    mask_blocks = []
    for i in range(0, len(sents), BATCH_SIZE):
        sent_batch = sents[i:i + BATCH_SIZE]
        tok_emb, mask = encode_batch(sent_batch)
        token_blocks.append(tok_emb)
        mask_blocks.append(mask)

    # Join the batches along the sentence axis and convert to NumPy for saving.
    tokens = torch.cat(token_blocks, dim=0).numpy().astype(SAVE_DTYPE)  # [N, MAX_LENGTH, 768]
    masks = torch.cat(mask_blocks, dim=0).numpy()                       # [N, MAX_LENGTH]

    # Output file is named after the transcript, without its extension.
    filename = os.path.basename(path)
    base = os.path.splitext(filename)[0]
    out_path = os.path.join(OUT_DIR, f"{base}_tokens_for_selfattn.npz")
    np.savez_compressed(
        out_path,
        token_embeddings=tokens,
        attention_mask=masks,
        model=np.array(MODEL_NAME),
        max_length=np.array(MAX_LENGTH),
    )


Found 21 text files.


BERT token embeddings: 100%|██████████| 21/21 [00:38<00:00,  1.82s/it]


In [5]:
# --- Config ---
# Checkpoint and batching parameters.
MODEL_NAME = "bert-base-uncased"
MAX_LENGTH = 512            # fixed length so arrays save cleanly
BATCH_SIZE = 16
SAVE_DTYPE = np.float32     # switch to np.float16 to save disk

# Input transcripts and output folder for this cell (PD, read text).
TEXT_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Transcript/PD_ReadText_transcriptions"
OUT_DIR = os.path.join("/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi", "PD_ReadText_berts_feats_tokens_only")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

os.makedirs(OUT_DIR, exist_ok=True)

# --- Load frozen BERT ---
# NOTE(review): an earlier cell already loads the tokenizer and model for the
# same MODEL_NAME; this reload repeats that work.
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
bert = AutoModel.from_pretrained(MODEL_NAME)
bert = bert.to(DEVICE)
bert = bert.eval()          # inference mode
# Freeze every weight: the model is only used as a feature extractor here.
for p in bert.parameters():
    p.requires_grad = False

def split_sentences(text: str):
    """Split ``text`` into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    The input is coerced with ``str()`` and stripped before splitting. Every
    piece is stripped, and empty or whitespace-only pieces are discarded.

    Returns:
        A non-empty list of strings. When no piece survives (for example the
        input is empty or only whitespace) the list holds the stripped input
        as its single element.
    """
    # NOTE(review): same function as the one defined in an earlier cell; this
    # re-definition shadows it.
    cleaned = str(text).strip()
    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+', cleaned):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    if not sentences:
        # Fallback keeps the result non-empty.
        return [cleaned]
    return sentences

@torch.no_grad()
def encode_batch(sentences):
    """Tokenize a batch of sentences and run them through the frozen BERT model.

    Each sentence is padded or truncated to exactly ``MAX_LENGTH`` tokens, so
    anything beyond that length is dropped without notice.

    Args:
        sentences: list of strings to encode together.

    Returns:
        ``(last_hidden, attention_mask)``, both moved to the CPU:
        ``last_hidden`` is ``[B, MAX_LENGTH, 768]`` and ``attention_mask`` is
        ``[B, MAX_LENGTH]``.
    """
    # NOTE(review): same function as the one defined in an earlier cell; this
    # re-definition shadows it.
    batch = tok(
        sentences,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",   # -> [B, MAX_LENGTH]
        return_tensors="pt",
    )
    # Move every tokenizer output onto the device the model lives on.
    inputs = {}
    for name, tensor in batch.items():
        inputs[name] = tensor.to(DEVICE)
    hidden_states = bert(**inputs).last_hidden_state   # [B, MAX_LENGTH, 768]
    attention_mask = inputs["attention_mask"]          # [B, MAX_LENGTH]
    return hidden_states.cpu(), attention_mask.cpu()

# Collect every transcript in TEXT_DIR; sorting keeps the processing order stable.
txt_pattern = os.path.join(TEXT_DIR, "*.txt")
files = sorted(glob.glob(txt_pattern))
print(f"Found {len(files)} text files.")

# One .npz per transcript: per-token embeddings plus the matching attention mask.
for path in tqdm(files, desc="BERT token embeddings"):
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    sents = split_sentences(text)

    # Encode BATCH_SIZE sentences at a time and keep each batch's outputs.
    token_blocks = []
    mask_blocks = []
    for i in range(0, len(sents), BATCH_SIZE):
        sent_batch = sents[i:i + BATCH_SIZE]
        tok_emb, mask = encode_batch(sent_batch)
        token_blocks.append(tok_emb)
        mask_blocks.append(mask)

    # Join the batches along the sentence axis and convert to NumPy for saving.
    tokens = torch.cat(token_blocks, dim=0).numpy().astype(SAVE_DTYPE)  # [N, MAX_LENGTH, 768]
    masks = torch.cat(mask_blocks, dim=0).numpy()                       # [N, MAX_LENGTH]

    # Output file is named after the transcript, without its extension.
    filename = os.path.basename(path)
    base = os.path.splitext(filename)[0]
    out_path = os.path.join(OUT_DIR, f"{base}_tokens_for_selfattn.npz")
    np.savez_compressed(
        out_path,
        token_embeddings=tokens,
        attention_mask=masks,
        model=np.array(MODEL_NAME),
        max_length=np.array(MAX_LENGTH),
    )


Found 16 text files.


BERT token embeddings: 100%|██████████| 16/16 [00:20<00:00,  1.28s/it]


In [7]:
# --- Config ---
# Checkpoint and batching parameters.
MODEL_NAME = "bert-base-uncased"
MAX_LENGTH = 512            # fixed length so arrays save cleanly
BATCH_SIZE = 16
SAVE_DTYPE = np.float32     # switch to np.float16 to save disk

# Input transcripts and output folder for this cell (PD, spontaneous speech).
TEXT_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Transcript/PD_Spontaneous_transcriptions"
OUT_DIR = os.path.join("/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi", "PD_Spontaneous_berts_feats_tokens_only")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

os.makedirs(OUT_DIR, exist_ok=True)

# --- Load frozen BERT ---
# NOTE(review): an earlier cell already loads the tokenizer and model for the
# same MODEL_NAME; this reload repeats that work.
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
bert = AutoModel.from_pretrained(MODEL_NAME)
bert = bert.to(DEVICE)
bert = bert.eval()          # inference mode
# Freeze every weight: the model is only used as a feature extractor here.
for p in bert.parameters():
    p.requires_grad = False

def split_sentences(text: str):
    """Split ``text`` into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    The input is coerced with ``str()`` and stripped before splitting. Every
    piece is stripped, and empty or whitespace-only pieces are discarded.

    Returns:
        A non-empty list of strings. When no piece survives (for example the
        input is empty or only whitespace) the list holds the stripped input
        as its single element.
    """
    # NOTE(review): same function as the one defined in an earlier cell; this
    # re-definition shadows it.
    cleaned = str(text).strip()
    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+', cleaned):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    if not sentences:
        # Fallback keeps the result non-empty.
        return [cleaned]
    return sentences

@torch.no_grad()
def encode_batch(sentences):
    """Tokenize a batch of sentences and run them through the frozen BERT model.

    Each sentence is padded or truncated to exactly ``MAX_LENGTH`` tokens, so
    anything beyond that length is dropped without notice.

    Args:
        sentences: list of strings to encode together.

    Returns:
        ``(last_hidden, attention_mask)``, both moved to the CPU:
        ``last_hidden`` is ``[B, MAX_LENGTH, 768]`` and ``attention_mask`` is
        ``[B, MAX_LENGTH]``.
    """
    # NOTE(review): same function as the one defined in an earlier cell; this
    # re-definition shadows it.
    batch = tok(
        sentences,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",   # -> [B, MAX_LENGTH]
        return_tensors="pt",
    )
    # Move every tokenizer output onto the device the model lives on.
    inputs = {}
    for name, tensor in batch.items():
        inputs[name] = tensor.to(DEVICE)
    hidden_states = bert(**inputs).last_hidden_state   # [B, MAX_LENGTH, 768]
    attention_mask = inputs["attention_mask"]          # [B, MAX_LENGTH]
    return hidden_states.cpu(), attention_mask.cpu()

# Collect every transcript in TEXT_DIR; sorting keeps the processing order stable.
txt_pattern = os.path.join(TEXT_DIR, "*.txt")
files = sorted(glob.glob(txt_pattern))
print(f"Found {len(files)} text files.")

# One .npz per transcript: per-token embeddings plus the matching attention mask.
for path in tqdm(files, desc="BERT token embeddings"):
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    sents = split_sentences(text)

    # Encode BATCH_SIZE sentences at a time and keep each batch's outputs.
    token_blocks = []
    mask_blocks = []
    for i in range(0, len(sents), BATCH_SIZE):
        sent_batch = sents[i:i + BATCH_SIZE]
        tok_emb, mask = encode_batch(sent_batch)
        token_blocks.append(tok_emb)
        mask_blocks.append(mask)

    # Join the batches along the sentence axis and convert to NumPy for saving.
    tokens = torch.cat(token_blocks, dim=0).numpy().astype(SAVE_DTYPE)  # [N, MAX_LENGTH, 768]
    masks = torch.cat(mask_blocks, dim=0).numpy()                       # [N, MAX_LENGTH]

    # Output file is named after the transcript, without its extension.
    filename = os.path.basename(path)
    base = os.path.splitext(filename)[0]
    out_path = os.path.join(OUT_DIR, f"{base}_tokens_for_selfattn.npz")
    np.savez_compressed(
        out_path,
        token_embeddings=tokens,
        attention_mask=masks,
        model=np.array(MODEL_NAME),
        max_length=np.array(MAX_LENGTH),
    )


Found 15 text files.


BERT token embeddings: 100%|██████████| 15/15 [00:33<00:00,  2.25s/it]


In [8]:
# --- Config ---
# Checkpoint and batching parameters.
MODEL_NAME = "bert-base-uncased"
MAX_LENGTH = 512            # fixed length so arrays save cleanly
BATCH_SIZE = 16
SAVE_DTYPE = np.float32     # switch to np.float16 to save disk

# Input transcripts and output folder for this cell (HC, spontaneous speech).
TEXT_DIR = "/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi/Transcript/HC_Spontaneous_transcriptions"
OUT_DIR = os.path.join("/mnt/d/Roshidat_Msc_Project/Audio_parkinson/pd&Hc_multi", "HC_Spontaneous_berts_feats_tokens_only")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

os.makedirs(OUT_DIR, exist_ok=True)

# --- Load frozen BERT ---
# NOTE(review): an earlier cell already loads the tokenizer and model for the
# same MODEL_NAME; this reload repeats that work.
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
bert = AutoModel.from_pretrained(MODEL_NAME)
bert = bert.to(DEVICE)
bert = bert.eval()          # inference mode
# Freeze every weight: the model is only used as a feature extractor here.
for p in bert.parameters():
    p.requires_grad = False

def split_sentences(text: str):
    """Split ``text`` into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    The input is coerced with ``str()`` and stripped before splitting. Every
    piece is stripped, and empty or whitespace-only pieces are discarded.

    Returns:
        A non-empty list of strings. When no piece survives (for example the
        input is empty or only whitespace) the list holds the stripped input
        as its single element.
    """
    # NOTE(review): same function as the one defined in an earlier cell; this
    # re-definition shadows it.
    cleaned = str(text).strip()
    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+', cleaned):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    if not sentences:
        # Fallback keeps the result non-empty.
        return [cleaned]
    return sentences

@torch.no_grad()
def encode_batch(sentences):
    """Tokenize a batch of sentences and run them through the frozen BERT model.

    Each sentence is padded or truncated to exactly ``MAX_LENGTH`` tokens, so
    anything beyond that length is dropped without notice.

    Args:
        sentences: list of strings to encode together.

    Returns:
        ``(last_hidden, attention_mask)``, both moved to the CPU:
        ``last_hidden`` is ``[B, MAX_LENGTH, 768]`` and ``attention_mask`` is
        ``[B, MAX_LENGTH]``.
    """
    # NOTE(review): same function as the one defined in an earlier cell; this
    # re-definition shadows it.
    batch = tok(
        sentences,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",   # -> [B, MAX_LENGTH]
        return_tensors="pt",
    )
    # Move every tokenizer output onto the device the model lives on.
    inputs = {}
    for name, tensor in batch.items():
        inputs[name] = tensor.to(DEVICE)
    hidden_states = bert(**inputs).last_hidden_state   # [B, MAX_LENGTH, 768]
    attention_mask = inputs["attention_mask"]          # [B, MAX_LENGTH]
    return hidden_states.cpu(), attention_mask.cpu()

# Collect every transcript in TEXT_DIR; sorting keeps the processing order stable.
txt_pattern = os.path.join(TEXT_DIR, "*.txt")
files = sorted(glob.glob(txt_pattern))
print(f"Found {len(files)} text files.")

# One .npz per transcript: per-token embeddings plus the matching attention mask.
for path in tqdm(files, desc="BERT token embeddings"):
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    sents = split_sentences(text)

    # Encode BATCH_SIZE sentences at a time and keep each batch's outputs.
    token_blocks = []
    mask_blocks = []
    for i in range(0, len(sents), BATCH_SIZE):
        sent_batch = sents[i:i + BATCH_SIZE]
        tok_emb, mask = encode_batch(sent_batch)
        token_blocks.append(tok_emb)
        mask_blocks.append(mask)

    # Join the batches along the sentence axis and convert to NumPy for saving.
    tokens = torch.cat(token_blocks, dim=0).numpy().astype(SAVE_DTYPE)  # [N, MAX_LENGTH, 768]
    masks = torch.cat(mask_blocks, dim=0).numpy()                       # [N, MAX_LENGTH]

    # Output file is named after the transcript, without its extension.
    filename = os.path.basename(path)
    base = os.path.splitext(filename)[0]
    out_path = os.path.join(OUT_DIR, f"{base}_tokens_for_selfattn.npz")
    np.savez_compressed(
        out_path,
        token_embeddings=tokens,
        attention_mask=masks,
        model=np.array(MODEL_NAME),
        max_length=np.array(MAX_LENGTH),
    )


Found 21 text files.


BERT token embeddings: 100%|██████████| 21/21 [01:01<00:00,  2.94s/it]
