# Analyze Vector DB

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

## Specify the embedding model and vector DB

In [None]:
# Embedding model handed to Chroma as its embedding function; it should be the
# same model the database was built with -- TODO confirm against ingestion.
embedding_model = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

# Directory of the persisted Chroma database (placeholder, edit before running).
database_loc = "/path/to/vectorDB"

vectorstore = Chroma(
    persist_directory=database_loc,
    embedding_function=embedding_model,
)

### (Optional) Print the contents

In [None]:
# Only the raw texts are used here, so ask Chroma to return just the documents
# instead of also loading the metadata of every record.
all_docs = vectorstore.get(include=["documents"])["documents"]

print(f"docs: {len(all_docs)}")

In [None]:
# Show which fields a `get()` result exposes. One record is enough for that,
# so avoid reloading the whole collection just to list the keys.
print(vectorstore.get(limit=1).keys())

## Test the re-ranker and compare the results

In [None]:
from typing import List
from langchain_core.runnables import chain
from langchain_core.documents import Document
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

@chain
def retriever(query: str) -> List[Document]:
    """Return up to 200 nearest documents for *query*, each tagged with its score.

    The score reported by the vector store is written to
    ``doc.metadata["original_score"]`` so later steps can compare it with a
    reranker score.

    Args:
        query: Free-text search phrase.

    Returns:
        The matching documents in the order the vector store returned them;
        an empty list when nothing matches.
    """
    hits = vectorstore.similarity_search_with_score(query, k=200)

    # Iterate the (doc, score) pairs directly: unpacking zip(*hits) raises
    # ValueError when the store returns no hits, and yields tuples, not a list.
    docs = []
    for doc, score in hits:
        # NOTE(review): for Chroma this score is presumably a distance
        # (lower = closer) -- confirm before comparing magnitudes.
        doc.metadata["original_score"] = score
        docs.append(doc)

    return docs

In [None]:
# === 2. Get top-k documents ===
phrase = "Why do I need both sliver and bastion keys?"
results = retriever.invoke(phrase)

# Flatten every hit into a plain dict. The id is the hit's position in the
# retriever output, and "metadata" is the Document's own dict (not a copy),
# so anything written to it later is visible on the Document as well.
docs = [
    dict(id=str(rank), text=hit.page_content, metadata=hit.metadata)
    for rank, hit in enumerate(results)
]

In [None]:
# === 3. Load reranker model ===

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and sequence-classification model come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-v2-m3")
model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-v2-m3")
model = model.to(device)
print(f"Using device: {device}")

# Put the model in evaluation mode before scoring.
model.eval()

In [None]:
# === 4. Prepare inputs for reranking ===
# One (query, passage) pair per retrieved document, in retrieval order.
pairs = [(phrase, doc["text"]) for doc in docs]

# Score in small batches: pushing every pair through the model at once (up to
# 200 sequences of up to 512 tokens) can exhaust memory, especially on a GPU.
rerank_batch_size = 16
scores = []

with torch.no_grad():
    # An empty `pairs` list simply skips the loop and leaves `scores` empty.
    for start in range(0, len(pairs), rerank_batch_size):
        batch = pairs[start:start + rerank_batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        outputs = model(**inputs)
        # reshape(-1) always gives a 1-D tensor, so a batch holding a single
        # pair still yields a list; squeeze() would collapse it to a bare
        # float and break the zip below.
        scores.extend(outputs.logits.reshape(-1).tolist())

# Add reranked scores to docs
for doc, score in zip(docs, scores):
    doc["metadata"]["rerank_score"] = score


In [None]:

# === 5. Print and compare rankings ===
# First listing: documents in the order the retriever returned them.
print("\n=== Original Ranking ===")
for doc in docs:
    print(f"Doc ID: {doc['id']}, Original Score: {doc['metadata']['original_score']:.4f}, URL: {doc['metadata']['source']}")

# Second listing: the same documents, highest reranker score first. The sort
# is stable, so documents with equal scores keep their retrieval order.
print("\n=== Reranked by BAAI/bge-reranker-v2-m3 ===")
for doc in sorted(
    docs,
    key=lambda entry: entry["metadata"]["rerank_score"],
    reverse=True,
):
    print(f"Doc ID: {doc['id']}, Rerank Score: {doc['metadata']['rerank_score']:.4f}, URL: {doc['metadata']['source']}")