In [1]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# =======================================================
# CONFIGURATION
# =======================================================
# Embedding checkpoint: a ModernBERT variant optimized for semantic search
# (it replaced the plain "answerdotai/ModernBERT-base" checkpoint).
MODEL_NAME = "nomic-ai/modernbert-embed-base"
CSV_PATH = "../bm25/travel_blogs.csv"
OUTPUT_EMB_PATH = "./travel_blog_embeddings.pt"

# Prefer the GPU whenever torch can see one.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# =======================================================
# TOKENIZER + ENCODER
# =======================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# `.to()` and `.eval()` both return the module itself, so the calls chain.
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE).eval()

# =======================================================
# CORPUS
# =======================================================
df = pd.read_csv(CSV_PATH)

# Only the "content" column is embedded; missing values become "".
texts = (
    df["content"]
    .fillna("")
    .tolist()
)

# -------------------------------------------------------
# EMBEDDING FUNCTION (Batch optimized)
# -------------------------------------------------------
def embed_texts(texts_batch, prefix=""):
    """Embed a batch of texts as mean-pooled, L2-normalized vectors.

    Args:
        texts_batch: A list of strings (a single string is accepted and
            treated as a one-element batch). Inputs longer than 512 tokens
            are truncated by the tokenizer.
        prefix: Optional string prepended to every text before tokenization.
            Defaults to "" so existing callers get unchanged embeddings.
            The model card for nomic-ai/modernbert-embed-base documents task
            prefixes ("search_document: " for corpus texts and
            "search_query: " for queries); whichever convention is chosen
            must be applied to BOTH the indexed texts and the queries.

    Returns:
        A float32 CPU tensor with one unit-length row per input text. An
        empty batch yields a tensor with zero rows.
    """
    # A bare string would otherwise be iterated character by character when
    # the prefix is applied below.
    if isinstance(texts_batch, str):
        texts_batch = [texts_batch]

    # Nothing to encode: return an empty matrix of the right width instead of
    # handing an empty list to the tokenizer.
    if len(texts_batch) == 0:
        return torch.empty((0, model.config.hidden_size))

    if prefix:
        texts_batch = [prefix + text for text in texts_batch]

    # Tokenize (handles list of strings automatically)
    encoded = tokenizer(
        texts_batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(DEVICE)

    # Forward pass
    with torch.no_grad():
        outputs = model(**encoded)

    # Pool in float32 regardless of the model's compute dtype: the 1e-9 clamp
    # below is not representable in half precision, and float32 is also what
    # the downstream FAISS index consumes.
    last_hidden = outputs.last_hidden_state.float()

    # Mean pooling over real (non-padding) tokens only. The mask is cast to
    # float explicitly so the clamp does not depend on int->float promotion.
    attention_mask = encoded["attention_mask"].unsqueeze(-1).float()
    sum_embeddings = torch.sum(last_hidden * attention_mask, dim=1)
    # Clamp the token count to avoid division by zero
    sum_mask = torch.clamp(torch.sum(attention_mask, dim=1), min=1e-9)
    embedding = sum_embeddings / sum_mask

    # Normalize embeddings to unit length
    embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)

    return embedding.cpu()

2025-12-01 18:13:22.487912: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-01 18:13:22.702184: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-01 18:13:24.546313: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


model.safetensors:   0%|          | 0.00/596M [00:00<?, ?B/s]

In [2]:

# -------------------------------------------------------
# LOOP THROUGH BLOG CONTENT AND EMBED
# -------------------------------------------------------
BATCH_SIZE = 32  # Adjust based on VRAM (16, 32, 64)

# torch.cat() rejects an empty list with an opaque error, so fail early with
# a message that names the actual cause.
if not texts:
    raise ValueError(f"No texts to embed: {CSV_PATH} produced an empty corpus")

# Process in batches. Every slice taken below holds between 1 and BATCH_SIZE
# texts, so no empty-batch guard is needed inside the loop.
embeddings = []
for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding travel blogs"):
    embeddings.append(embed_texts(texts[i : i + BATCH_SIZE]))

# Concatenate rather than stack: embed_texts already returns a batch tensor.
embeddings_tensor = torch.cat(embeddings)

# The search step maps hit positions straight back onto dataframe rows, so
# the matrix must contain exactly one row per saved id.
blog_ids = df["id"].tolist()
if embeddings_tensor.shape[0] != len(blog_ids):
    raise RuntimeError(
        f"Embedding/id count mismatch: {embeddings_tensor.shape[0]} vectors "
        f"for {len(blog_ids)} ids"
    )

# -------------------------------------------------------
# SAVE EMBEDDINGS
# -------------------------------------------------------
torch.save({
    "embeddings": embeddings_tensor,
    "ids": blog_ids,
}, OUTPUT_EMB_PATH)

print("Saved embeddings to:", OUTPUT_EMB_PATH)
print("Embedding matrix shape:", embeddings_tensor.shape)


W1201 18:14:33.971000 39262 torch/_inductor/utils.py:1436] [1/0_1] Not enough SMs to use max_autotune_gemm mode
Embedding travel blogs: 100%|██████████| 233/233 [03:19<00:00,  1.17it/s]

Saved embeddings to: ./travel_blog_embeddings.pt
Embedding matrix shape: torch.Size([7441, 768])





In [8]:
import faiss
import torch

# Reload the embeddings from the same location the embedding cell wrote to;
# sharing OUTPUT_EMB_PATH keeps the save and load paths from drifting apart.
data = torch.load(OUTPUT_EMB_PATH, weights_only=True)
emb = data["embeddings"]  # shape (N, 768)

# search_blogs() turns FAISS hit positions directly into df row positions,
# so a stale or partial embeddings file would silently return wrong rows.
if emb.shape[0] != len(data["ids"]) or emb.shape[0] != len(df):
    raise ValueError(
        f"Embeddings file is out of sync with the dataframe: "
        f"{emb.shape[0]} vectors, {len(data['ids'])} saved ids, {len(df)} rows"
    )

# Exact (brute-force) index. IndexFlatL2 reports SQUARED L2 distances; since
# the stored vectors are unit-length this equals 2 - 2 * cosine similarity,
# so smaller means more similar.
index = faiss.IndexFlatL2(emb.shape[1])
# FAISS requires a C-contiguous float32 array.
index.add(emb.contiguous().to(torch.float32).numpy())

In [9]:
def search_blogs(query, k=5):
    """Return the blog rows whose embeddings are nearest to `query`.

    Args:
        query: Free-text search string.
        k: Maximum number of hits to return. Values larger than the number
            of indexed vectors are capped; values below 1 yield no hits.

    Returns:
        A copy of the matching `df` rows, best match first, with an added
        "distance" column holding the value reported by the FAISS index
        (smaller is closer).
    """
    # NOTE(review): the model card for nomic-ai/modernbert-embed-base
    # documents a "search_query: " prefix for queries. The corpus in this
    # notebook was embedded without any prefix, so none is added here to keep
    # queries and documents consistent -- revisit both sides together.

    # Never request more neighbours than the index holds: FAISS fills the
    # missing slots with label -1, which `df.iloc` would silently resolve to
    # the LAST row of the dataframe.
    k = min(int(k), index.ntotal)
    if k < 1:
        return df.iloc[:0].assign(distance=pd.Series(dtype="float32"))

    # Pass as a list [query]
    q_emb = embed_texts([query]).numpy()
    distances, idxs = index.search(q_emb, k)

    # Defensive: drop any remaining "no result" (-1) slots.
    found = idxs[0] >= 0
    rows = df.iloc[idxs[0][found]].copy()
    rows["distance"] = distances[0][found]
    return rows

In [16]:
query = "mountains in europe"
results = search_blogs(query)

print(f"Search results for: '{query}'\n")
# `results` keeps the original dataframe index, so the rank comes from
# enumerate() rather than from the row label.
for rank, (_, row) in enumerate(results.iterrows(), start=1):
    print(f"Rank {rank} (Distance: {row['distance']:.4f})")

    # Use correct column names from your dataframe.
    # Scraped titles can carry stray leading/trailing newlines, which would
    # otherwise split the "Title:" line in two.
    if 'page_title' in row:
        print(f"Title: {str(row['page_title']).strip()}")
    if 'location_name' in row:
        print(f"Location: {row['location_name']}")

    # Print a longer snippet of the content, flattened onto one line
    content_snippet = str(row['content'])[:300].replace('\n', ' ')
    print(f"Content Snippet: {content_snippet}...\n")
    print("-" * 80)

Search results for: 'mountains in europe'

Rank 1 (Distance: 0.7334)
Title: 5 Reasons For A Ski Trip To Europe! – Mark's Travel Journal
Location: Europe
Content Snippet: travel experience if you are a skier or snowboarder there are at least 5 reasons for a ski trip to europe the alps have a special mix of landscapes style glamour and après ski and when you add the world class ski terrain in austria switzerland germany italy and france a ski trip to europe cant be be...

--------------------------------------------------------------------------------
Rank 2 (Distance: 0.7575)
Title: 
Chill Out | Travel Between The Pages
Location: the Western Hemisphere
Content Snippet: summer has arrived early in my little corner of the western hemisphere and with the wilting heat my thoughts turn to the far north coincidentally the worlds largest and most northern national park is celebrating its 50th anniversary kalaallit nunaanni nuna eqqissisimatitaq orgrønlands nationalpark o...

------------------

In [None]:
# Display the raw hit table (the dataframe columns plus the added "distance"
# column) so the ranked rows can be inspected directly.
results

Unnamed: 0,id,blog_url,page_url,page_title,page_description,page_author,location_name,latitude,longitude,content,distance
1161,1159,https://markstraveljournal.wordpress.com/,https://markstraveljournal.me/2019/11/14/5-rea...,5 Reasons For A Ski Trip To Europe! – Mark's T...,"If you are a skier or snowboarder, there are a...",,Europe,51.0,10.0,travel experience if you are a skier or snowbo...,0.733406
6749,7918,https://travelbetweenthepages.wordpress.com/,https://travelbetweenthepages.com/2024/05/28/c...,\nChill Out | Travel Between The Pages,Summer has arrived early in my little corner o...,,the Western Hemisphere,45.419592,-75.708378,summer has arrived early in my little corner o...,0.75749
2029,2043,https://thewomenstravelgroup.wordpress.com/,https://thewomenstravelgroup.com/seeing-the-wo...,Seeing the World High Up - The Women's Travel ...,Seeing the World High Up - cooking trips in Italy,,Italy,42.638426,12.674297,homecooking trips in italyseeing the world hig...,0.765224
3702,4179,https://thriftytravelmama.wordpress.com/,https://thriftytravelmama.wordpress.com/2013/0...,\nKastelburg – Castle Ruins for Kids | Thrifty...,One of the things I love most about living in ...,,Kastelburg,48.097558,7.957541,one of the things i love most about living in ...,0.765539
5609,6477,https://travelsthroughmylenstravel.wordpress.com/,https://travelsthroughmylens.com/2023/03/02/gi...,"Gibraltar, the Mighty Rock – Travels Through M...","Gibraltar, also known as the Rock, is one of f...",,Mighty Rock,13.130837,-59.633284,travels through my lens sharing photos and sto...,0.774408
