In [1]:
import os, glob
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
Model_name = "bert-base-uncased"
text_dir = "/home/jovyan/Desktop/PD_LLM/data/voice_parkinson/HC_AH/HC_AH_TEXT_output"
out_Dir = os.path.join(text_dir, "berts_feats")
max_length = 512
batch_size = 16
device = "cuda" if torch.cuda.is_available() else "cpu"
save_tokens = False

os.makedirs(out_Dir, exist_ok=True)

files = sorted(glob.glob(os.path.join(text_dir, "*.txt")))
print(f"Found {len(files)} text files.")

Found 41 text files.


In [3]:
tokenizer = AutoTokenizer.from_pretrained(Model_name, token=None)
model = AutoModel.from_pretrained(Model_name, token=None)
model.to(device)
model.eval()

def read_texts(paths):
    texts = []
    for p in paths:
        with open(p, "r", encoding="utf-8", errors="ignore") as f:
            texts.append(f.read())
    return texts

cls_vectors = []
mean_vectors = []

for i in tqdm(range(0, len(files), batch_size)):
    batch_paths = files[i:i + batch_size]
    texts = read_texts(batch_paths)

    enc = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length = max_length,
    )

    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        outputs = model(**enc)
      
        last_hidden_state = outputs.last_hidden_state  # (B, T, D)

        cls = last_hidden_state[:, 0, :]

        mask = enc["attention_mask"].unsqueeze(-1)
        summed = (last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1)
        mean = summed / counts
        
        cls_vectors.append(cls.cpu().numpy())
        mean_vectors.append(mean.cpu().numpy())
        
        
        if save_tokens:
            
            for j, p in enumerate(batch_paths):
                L = int(enc["attention_mask"][j].sum().item())
                tokenizer_features = last_hidden_state[j, :L, :].cpu().numpy()
                base = os.path.splitext(os.path.basename(p))[0]
                np.save(os.path.join(out_Dir, f"{base}.tokens.last.npy"), tokenizer_features)

cls_vectors = np.vstack(cls_vectors)
mean_vectors = np.vstack(mean_vectors)
np.save(os.path.join(out_Dir, "cls_vectors.npy"), cls_vectors)
np.save(os.path.join(out_Dir, "mean_vectors.npy"), mean_vectors)


print(os.path.join(out_Dir, "cls_vectors.npy"), cls_vectors.shape)
print(os.path.join(out_Dir, "mean_vectors.npy"), mean_vectors.shape)


  [2m2025-08-25T07:50:51.342027Z[0m [33m WARN[0m  [33mReqwest(reqwest::Error { kind: Request, url: "https://cas-server.xethub.hf.co/reconstruction/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725", source: hyper_util::client::legacy::Error(Connect, ConnectError("dns error", Custom { kind: Uncategorized, error: "failed to lookup address information: No address associated with hostname" })) }). Retrying...[0m
    [2;3mat[0m /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:213

  [2m2025-08-25T07:50:51.342114Z[0m [33m WARN[0m  [33mRetry attempt #0. Sleeping 1.178943499s before the next attempt[0m
    [2;3mat[0m /root/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs:171

  [2m2025-08-25T07:50:52.562076Z[0m [33m WARN[0m  [33mReqwest(reqwest::Error { kind: Request, url: "https://cas-server.xethub.hf.co/reconstruction/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725", source: hyper_

100%|██████████| 3/3 [00:00<00:00,  5.69it/s]

/home/jovyan/Desktop/PD_LLM/data/voice_parkinson/HC_AH/HC_AH_TEXT_output/berts_feats/cls_vectors.npy (41, 768)
/home/jovyan/Desktop/PD_LLM/data/voice_parkinson/HC_AH/HC_AH_TEXT_output/berts_feats/mean_vectors.npy (41, 768)





In [8]:
pd_text_dir = "/home/jovyan/Desktop/PD_LLM/data/voice_parkinson/PD_AH/PD_AH_Text_output"
pd_out_Dir = os.path.join(pd_text_dir, "berts_feats")
max_length = 512
batch_size = 16
device = "cuda" if torch.cuda.is_available() else "cpu"
save_tokens = False

os.makedirs(pd_out_Dir, exist_ok=True)

files = sorted(glob.glob(os.path.join(pd_text_dir, "*.txt")))
print(f"Found {len(files)} text files.")

Found 40 text files.


In [9]:
tokenizer = AutoTokenizer.from_pretrained(Model_name, token=None)
model = AutoModel.from_pretrained(Model_name, token=None)
model.to(device)
model.eval()

def read_texts(paths):
    texts = []
    for p in paths:
        with open(p, "r", encoding="utf-8", errors="ignore") as f:
            texts.append(f.read())
    return texts

cls_vectors = []
mean_vectors = []

for i in tqdm(range(0, len(files), batch_size)):
    batch_paths = files[i:i + batch_size]
    texts = read_texts(batch_paths)

    enc = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length = max_length,
    )

    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        outputs = model(**enc)
      
        last_hidden_state = outputs.last_hidden_state  # (B, T, D)

        cls = last_hidden_state[:, 0, :]

        mask = enc["attention_mask"].unsqueeze(-1)
        summed = (last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1)
        mean = summed / counts
        
        cls_vectors.append(cls.cpu().numpy())
        mean_vectors.append(mean.cpu().numpy())
        
        
        if save_tokens:
            
            for j, p in enumerate(batch_paths):
                L = int(enc["attention_mask"][j].sum().item())
                tokenizer_features = last_hidden_state[j, :L, :].cpu().numpy()
                base = os.path.splitext(os.path.basename(p))[0]
                np.save(os.path.join(out_Dir, f"{base}.tokens.last.npy"), tokenizer_features)

cls_vectors = np.vstack(cls_vectors)
mean_vectors = np.vstack(mean_vectors)
np.save(os.path.join(out_Dir, "cls_vectors.npy"), cls_vectors)
np.save(os.path.join(out_Dir, "mean_vectors.npy"), mean_vectors)


print(os.path.join(out_Dir, "cls_vectors.npy"), cls_vectors.shape)
print(os.path.join(out_Dir, "mean_vectors.npy"), mean_vectors.shape)


100%|██████████| 3/3 [00:00<00:00, 43.92it/s]

/home/jovyan/Desktop/PD_LLM/data/voice_parkinson/HC_AH/HC_AH_TEXT_output/berts_feats/cls_vectors.npy (40, 768)
/home/jovyan/Desktop/PD_LLM/data/voice_parkinson/HC_AH/HC_AH_TEXT_output/berts_feats/mean_vectors.npy (40, 768)



