In [3]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# =======================================================
# Configuration: model checkpoint, input CSV, output file, device
# =======================================================
MODEL_NAME = "answerdotai/ModernBERT-base"
CSV_PATH = "../bm25/travel_blogs.csv"
OUTPUT_EMB_PATH = "./travel_blog_embeddings.pt"

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# =======================================================
# Tokenizer and encoder
# =======================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# .eval() returns the module itself, so it can be chained after the device move.
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE).eval()

# =======================================================
# Blog data
# =======================================================
df = pd.read_csv(CSV_PATH)

# Only the "content" column is embedded; missing values become empty strings.
texts = df["content"].fillna("").tolist()

# -------------------------------------------------------
# EMBEDDING FUNCTION (ModernBERT mean pooling)
# -------------------------------------------------------
def embed_text(text, max_length=512):
    """Embed ``text`` as the attention-masked mean of the model's last hidden states.

    Relies on the module-level ``tokenizer``, ``model`` and ``DEVICE``.

    Args:
        text: The string to embed.
        max_length: Token limit handed to the tokenizer; longer inputs are
            truncated to this many tokens. Defaults to 512, the value that
            used to be hard-coded, so existing callers are unaffected. Raise
            it (within what the loaded model supports) to cover more of a
            long document.

    Returns:
        A CPU tensor. For a single input string its shape is (hidden_size,).
    """
    # Tokenize
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(DEVICE)

    # Forward pass (no autograd bookkeeping needed for inference)
    with torch.no_grad():
        outputs = model(**encoded)

    last_hidden = outputs.last_hidden_state   # [batch, seq_len, hidden_size]

    # Mean pooling: average only over positions the attention mask marks as real tokens.
    attention_mask = encoded["attention_mask"].unsqueeze(-1)
    sum_embeddings = torch.sum(last_hidden * attention_mask, dim=1)
    # Clamp the token count so an all-zero mask produces a zero vector
    # instead of a 0/0 NaN that would poison the downstream index.
    sum_mask = torch.sum(attention_mask, dim=1).clamp(min=1)
    embedding = sum_embeddings / sum_mask

    return embedding.squeeze(0).cpu()    # Vector shape = (hidden_size,)

  from .autonotebook import tqdm as notebook_tqdm
  from scipy.sparse import csr_matrix, issparse
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [4]:

# -------------------------------------------------------
# LOOP THROUGH BLOG CONTENT AND EMBED (cached on disk)
# -------------------------------------------------------
# Embedding the whole corpus one post at a time is slow, so a previous run's
# result is reused when it still matches the current dataframe. Delete the
# file at OUTPUT_EMB_PATH to force a recompute (e.g. after changing the model).
doc_ids = df["id"].tolist()

try:
    cached = torch.load(OUTPUT_EMB_PATH, weights_only=True)
except FileNotFoundError:
    cached = None

# Only trust the cache if it describes exactly the rows currently loaded.
cache_is_valid = (
    isinstance(cached, dict)
    and "embeddings" in cached
    and cached.get("ids") == doc_ids
    and cached["embeddings"].shape[0] == len(texts)
)

if cache_is_valid:
    embeddings_tensor = cached["embeddings"]
    # Keep the per-document list available, as the uncached path does.
    embeddings = list(embeddings_tensor.unbind(0))
    print("Loaded cached embeddings from:", OUTPUT_EMB_PATH)
else:
    embeddings = []
    for text in tqdm(texts, desc="Embedding travel blogs"):
        embeddings.append(embed_text(text))

    # Stack into tensor of shape (num_documents, hidden_size)
    embeddings_tensor = torch.stack(embeddings)

    # -------------------------------------------------------
    # SAVE EMBEDDINGS
    # -------------------------------------------------------
    torch.save({
        "embeddings": embeddings_tensor,
        "ids": doc_ids,
    }, OUTPUT_EMB_PATH)

    print("Saved embeddings to:", OUTPUT_EMB_PATH)

print("Embedding matrix shape:", embeddings_tensor.shape)


Embedding travel blogs: 100%|██████████| 7441/7441 [2:23:38<00:00,  1.16s/it]  


Saved embeddings to: ./travel_blog_embeddings.pt
Embedding matrix shape: torch.Size([7441, 768])


In [11]:
import faiss
import torch

# Reload from the path the embedding cell wrote to, rather than repeating the
# file name as a second literal that could drift out of sync.
data = torch.load(OUTPUT_EMB_PATH, weights_only=True)

# faiss consumes float32, C-contiguous arrays; both calls are no-ops when the
# saved tensor already satisfies that.
emb = data["embeddings"].to(torch.float32).contiguous()  # shape (N, hidden_size)

# search_blogs maps faiss result positions straight onto df rows, so the index
# must contain exactly one vector per dataframe row.
assert emb.shape[0] == len(df), (
    f"Embedding file has {emb.shape[0]} rows but df has {len(df)}; "
    "regenerate the embeddings for the current CSV."
)

# Exact (brute-force) L2 index over the embedding dimension.
index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb.numpy())

In [12]:
def search_blogs(query, k=5):
    """Return the blog rows whose embeddings are nearest to ``query``.

    Relies on the module-level ``embed_text``, ``index`` and ``df``; result
    positions from the index are interpreted as row positions in ``df``.

    Args:
        query: Free-text search string.
        k: Maximum number of results to return.

    Returns:
        A copy of the matching ``df`` rows, in the order the index returned
        them, with an added ``distance`` column holding the distance reported
        by the index (squared L2 for a faiss ``IndexFlatL2``). Fewer than
        ``k`` rows are returned when the index holds fewer than ``k`` vectors.
    """
    # faiss expects a float32 matrix of shape (n_queries, dim).
    q_emb = embed_text(query).float().unsqueeze(0).numpy()
    distances, idxs = index.search(q_emb, k)

    # When fewer than k neighbours exist, faiss pads the labels with -1.
    # Passing -1 to df.iloc would silently return the LAST row as a bogus hit,
    # so drop the padded slots before looking rows up.
    found = idxs[0] >= 0
    rows = df.iloc[idxs[0][found]].copy()
    rows["distance"] = distances[0][found]
    return rows