In [None]:
import json
import os

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the run configuration. JSON is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default.
with open("../data/parameters.json", "r", encoding="utf-8") as file:
    param = json.load(file)

# The node table is read from the data directory named in the configuration
# (requires the `os` and `pandas as pd` imports at the top of the file).
nodes = pd.read_csv(os.path.join(param['files']['data_dir'], "nodes_snake.csv"), sep=',')

In [None]:
# Tokenizer and encoder are loaded from the same checkpoint so that the token
# ids produced by one match the vocabulary expected by the other.
tokenizer = AutoTokenizer.from_pretrained("neuml/pubmedbert-base-embeddings")
model = AutoModel.from_pretrained("neuml/pubmedbert-base-embeddings")
model.eval()
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell no longer fails on machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")


def meanpooling(outputs, attention_mask):
    """Average the token vectors of each sequence, skipping masked positions.

    Args:
        outputs: Object exposing ``last_hidden_state`` (annotated ``[B, T, H]``
            in the original code).
        attention_mask: Tensor with one entry per token; positions holding 0
            contribute nothing to the average.

    Returns:
        One pooled vector per sequence (``[B, H]``).
    """
    hidden = outputs.last_hidden_state
    # Add a trailing axis so the mask broadcasts over the hidden dimension.
    weights = attention_mask.unsqueeze(-1).float()
    # Clamp the token count so a fully masked sequence divides by a tiny
    # positive number rather than by zero.
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    weighted_sum = (hidden * weights).sum(dim=1)
    return weighted_sum / token_count


def node_embedding(list_nodes_sentence, batch_size=32, max_length=512):
    """Embed node texts with the module-level ``tokenizer`` and ``model``.

    Texts are tokenized and encoded in batches, mean-pooled over their
    unmasked tokens, and the per-batch results are moved to the CPU and
    concatenated in input order.

    Args:
        list_nodes_sentence: The texts to embed, one per node. Any iterable of
            strings is accepted; a single string is treated as one text.
        batch_size: Number of texts encoded per forward pass.
        max_length: Token limit passed to the tokenizer; longer texts are
            truncated.

    Returns:
        A CPU tensor with one row per input text. For empty input the result
        has zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # A bare string would otherwise be sliced character-wise.
    if isinstance(list_nodes_sentence, str):
        list_nodes_sentence = [list_nodes_sentence]
    # Materialize as a plain list: the tokenizer is handed list slices, so
    # other sequence types (e.g. a pandas Series) work as well.
    sentences = list(list_nodes_sentence)

    if not sentences:
        # torch.cat raises on an empty list, so return an empty result
        # directly. NOTE(review): assumes the model config exposes
        # `hidden_size`, as BERT-style configs do -- confirm if the
        # checkpoint changes.
        return torch.empty((0, model.config.hidden_size))

    all_embs = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        ).to(model.device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**inputs)
            emb = meanpooling(outputs, inputs["attention_mask"])

        # Move each batch off the device right away so accelerator memory
        # does not grow with the number of texts.
        all_embs.append(emb.cpu())

    return torch.cat(all_embs, dim=0)


# ------ your data ------
# list_nodes_sentence: list[str], one text per node
# all_node_names: list[str], same order as list_nodes_sentence
# NOTE(review): neither variable is defined in the cells shown here; both must
# be built (presumably from `nodes`) before this cell runs.

emb_all = node_embedding(list_nodes_sentence, batch_size=32)  # [N, 768]

# name2id maps a node name to a row of emb_all, so the two must have the same
# length; fail loudly instead of writing a mapping that points at wrong rows.
if len(all_node_names) != emb_all.shape[0]:
    raise ValueError(
        f"Got {len(all_node_names)} node names for {emb_all.shape[0]} embeddings; "
        "all_node_names must align one-to-one with list_nodes_sentence."
    )

# save embeddings + node names + index mapping
torch.save(emb_all, "node_emb_all.pt")
with open("all_node_names.json", "w") as f:
    json.dump(all_node_names, f)
# NOTE: if a name occurs more than once, name2id keeps only its last index.
name2id = {name: i for i, name in enumerate(all_node_names)}
with open("name2id.json", "w") as f:
    json.dump(name2id, f)
