# Analyze Vector DB

In [None]:
from pathlib import Path

from langchain_chroma import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

## Specify the embedding model and vector DB

In [None]:
# ---------------------------- CONFIGURATION ----------------------------
# Known vector stores, keyed by a short alias. Each value is the directory
# handed to Chroma as `persist_directory` further down in the notebook.
DATABASE_OPTIONS = {
    "qa": "../../data/vectordbs/qa_tool/Cleaned_DB",
    "code_gen": "../../data/vectordbs/code_gen/",
    "custom": "my_vector_store",  # scratch store for ad-hoc testing
}

# Alias of the store to inspect.  <<< Edit this line to switch databases.
ANALYZE_DB = "qa"
# -----------------------------------------------------------------------

# Resolve the alias to its directory; an unknown alias raises KeyError here.
database_loc = DATABASE_OPTIONS[ANALYZE_DB]
print(
    f"Analyzing database: {ANALYZE_DB}",
    f"Path: {database_loc}",
    sep="\n",
)

In [None]:
# Fail fast on a wrong path. Chroma creates a brand-new, empty store when the
# persist directory does not exist, so a typo (or running the notebook from a
# different working directory) would otherwise just show "docs: 0" below.
db_path = Path(database_loc)
if not db_path.is_dir():
    raise FileNotFoundError(
        f"Vector DB directory not found: {db_path.resolve()} "
        "(check the configured path and the notebook's working directory)"
    )

# Embedding model used to embed queries against this store.
# NOTE(review): it must be the same model the store was built with -- confirm.
embedding_model = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

# Open the persisted Chroma store selected in the configuration cell.
vectorstore = Chroma(
    persist_directory=database_loc,
    embedding_function=embedding_model,
)

### (optional) Print the contents

In [None]:
# `get()` is called without filters; only the document texts are kept.
store_contents = vectorstore.get()
all_docs = store_contents['documents']

print(f"docs: {len(all_docs)}")

# Optional: uncomment the loop below to dump every stored document.
# Beware of very long output on large stores.
# for idx, doc in enumerate(all_docs):
#     print(f"Document {idx + 1}:")
#     print(doc)
#     print("-" * 80)

## Run a similarity search

In [None]:
from typing import List
from langchain_core.runnables import chain
from langchain_core.documents import Document

@chain
def retriever(query: str) -> List[Document]:
    """Return the top 8 vector-search hits for ``query`` with scores attached.

    Each returned document has the score reported by
    ``vectorstore.similarity_search_with_score`` stored under
    ``doc.metadata["score"]``.

    Args:
        query: Free-text search phrase.

    Returns:
        The matching documents in the order the vector store returned them;
        an empty list when there are no hits.
    """
    hits = vectorstore.similarity_search_with_score(query, k=8)

    # Iterate the (doc, score) pairs directly rather than unzipping them:
    # `docs, scores = zip(*[])` raises ValueError when the search is empty,
    # and zip would also hand back a tuple instead of the annotated list.
    docs = []
    for doc, score in hits:
        doc.metadata["score"] = score
        docs.append(doc)

    return docs

In [None]:
phrase = "What are slices?"

# Reuse the embedding model that backs the vector store instead of building a
# second HuggingFaceEmbeddings() with an implicit default model: this avoids
# loading another model and keeps the vector consistent with the store.
# The vector itself is not needed by the retriever (it embeds the query
# internally); it is kept available for manual inspection.
embedding = embedding_model.embed_query(phrase)

results = retriever.invoke(phrase)

# Each hit's metadata includes the "score" entry attached by `retriever`.
for result in results:
    print(result.metadata)

## Compare Pre-Reranking vs Post-Reranking

Retrieves `K` documents via vector similarity, then reranks them with `BAAI/bge-reranker-v2-m3`.
Shows how each document's rank changes so you can judge whether reranking is meaningfully reordering the results.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Tokenizer and cross-encoder weights come from the same checkpoint.
reranker_checkpoint = "BAAI/bge-reranker-v2-m3"
rerank_tokenizer = AutoTokenizer.from_pretrained(reranker_checkpoint)
rerank_model = AutoModelForSequenceClassification.from_pretrained(reranker_checkpoint)
rerank_model = rerank_model.to(device)

# Evaluation mode: the model is only used for scoring in this notebook.
rerank_model.eval()
print("Reranker loaded")

In [None]:
QUERY = "what is the difference between a slice key and a bastion key?"  # <<< Change this
K = 20  # documents to retrieve (matches RETRIEVAL_K in .env)

# --- Retrieve ---
# Hits arrive in vector-search order, so list position + 1 is the "pre" rank.
retrieved = vectorstore.similarity_search_with_score(QUERY, k=K)
pre_docs = [doc for doc, _ in retrieved]
pre_scores = [score for _, score in retrieved]  # lower = more similar in ChromaDB L2

# --- Rerank ---
# Score every (query, passage) pair with the cross-encoder in a single batch.
pairs = [(QUERY, doc.page_content) for doc in pre_docs]
if pairs:
    with torch.no_grad():
        # Call the tokenizer directly; `batch_encode_plus` is deprecated in
        # favour of `__call__`.
        inputs = rerank_tokenizer(
            pairs, padding=True, truncation=True,
            return_tensors="pt", max_length=512,
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        # view(-1) always yields a 1-D tensor, so a single retrieved document
        # produces a one-element list rather than a bare float.
        # Assumes the reranker head emits one logit per pair -- checked below.
        rerank_scores = rerank_model(**inputs).logits.view(-1).float().tolist()
else:
    # Nothing retrieved: skip the model call (the tokenizer rejects an empty batch).
    rerank_scores = []

assert len(rerank_scores) == len(pre_docs), "expected exactly one rerank score per document"

# --- Build comparison table ---
# Sort pre-rank indices by rerank score (highest first). Carrying the index
# through the sort gives the original rank without an id()-keyed lookup;
# sorted() is stable, so ties keep their vector-search order.
order = sorted(range(len(pre_docs)), key=lambda i: rerank_scores[i], reverse=True)
post_docs = [(pre_docs[i], rerank_scores[i]) for i in order]

print(f"Query: '{QUERY}'")
print(f"{'Post':>4}  {'Pre':>4}  {'Move':>5}  {'Vec score':>10}  {'Rerank score':>12}  Source")
print("-" * 90)
for post_i, pre_idx in enumerate(order, start=1):
    doc = pre_docs[pre_idx]
    pre_i = pre_idx + 1
    # Positive move = the document climbed after reranking.
    move = pre_i - post_i
    arrow = f"▲{move}" if move > 0 else (f"▼{abs(move)}" if move < 0 else "  =")
    # Fall back to a one-line content preview when no "source" metadata exists.
    source = doc.metadata.get("source", doc.page_content[:60].replace("\n", " "))
    print(
        f"{post_i:>4}  {pre_i:>4}  {arrow:>5}  "
        f"{pre_scores[pre_idx]:>10.4f}  {rerank_scores[pre_idx]:>12.4f}  {source}"
    )

if not order:
    print("(no documents retrieved)")