In [None]:
# --- 1. Installation and Imports ---
!pip install -q sentence-transformers chromadb torch

import chromadb
from sentence_transformers import SentenceTransformer, util
import torch
import os # Included for cleanliness, though not strictly needed here

# --- 2. Configuration ---
# Hugging Face Hub repository ID of the SentenceTransformer model to load.
HF_REPO_ID = "zacCMU/miniLM2-ENG3"



# --- 3. Model Download and Load ---
# SentenceTransformer handles downloading the model from the Hub.
# The resulting `embedder` is the single model instance used by the rest of
# this script: it is wrapped for ChromaDB and also called directly in the
# similarity check at the end.
print(f"Downloading and loading model: {HF_REPO_ID}")
embedder = SentenceTransformer(HF_REPO_ID)


# --- 4. ChromaDB Embedding Function (Custom Wrapper) ---
# Adapter that lets a ChromaDB collection embed text with an already-loaded
# SentenceTransformer model instead of ChromaDB's default embedder.
class CustomSBERTEmbeddingFunction(chromadb.EmbeddingFunction):
    """ChromaDB embedding function backed by a pre-loaded sentence encoder."""

    def __init__(self, model):
        """Keep a reference to the encoder; no model loading happens here.

        Args:
            model: Object exposing ``encode(texts, convert_to_tensor=False)``
                whose result supports ``tolist()``. In this file that is the
                SentenceTransformer instance ``embedder``.
        """
        self._model = model

    def __call__(self, texts):
        """Embed ``texts`` and return the vectors as plain Python lists.

        Args:
            texts: The batch of strings ChromaDB asks to have embedded.

        Returns:
            The encoder output converted with ``tolist()`` -- a list of
            lists of floats, which is the format ChromaDB expects.
        """
        vectors = self._model.encode(texts, convert_to_tensor=False)
        return vectors.tolist()

    def name(self):
        """Return the identifier required by ChromaDB's validation logic."""
        return "custom_sbert_wrapper"

# Instantiate the wrapper with the downloaded model
custom_ef = CustomSBERTEmbeddingFunction(embedder)

# --- 5. Data Preparation and Vector Store Building ---
# Small demo corpus: one short engineering passage per entry. A passage's
# position in this list determines its ID ("doc_0", "doc_1", ...).
# NOTE(review): the Von Neumann/Harvard passage does not match the usual
# textbook description of those architectures -- confirm the wording is
# intentional test data before relying on it.
documents = [
    "A **PID controller** (Proportional-Integral-Derivative controller) is a control loop mechanism that continuously calculates an error value as the difference between a desired setpoint and a measured process variable.",
    "The **Von Neumann architecture** separates memory and I/O into two distinct buses, unlike the Harvard architecture which uses shared buses.",
    "**Finite Element Analysis (FEA)** is a computational method for predicting how a product reacts to real-world forces, heat, vibration, and fluid flow. It uses a mesh to discretize complex geometries.",
    "**Thermodynamics' Second Law** states that the total entropy of an isolated system can only increase over time. It can never decrease.",
    "The **Shannon-Weaver model** of communication consists of six elements: Sender, Encoder, Channel, Noise, Decoder, and Receiver.",
    "In software engineering, **Agile methodologies** focus on iterative development, delivering working software frequently, and adapting to change.",
]

# Build the Vector Store (in-memory)
client = chromadb.Client()

# Attaching the custom embedding function to the collection means documents
# and queries passed as raw text are embedded with the model loaded above.
collection = client.get_or_create_collection(
    name="engineering_corpus",
    embedding_function=custom_ef,
)

collection.add(
    documents=documents,
    ids=[f"doc_{i}" for i in range(len(documents))],
)
# The leading check mark had been stored as mis-decoded UTF-8 bytes; it is
# written here as the single character it was meant to be.
print(f"✅ Indexed {len(documents)} documents into ChromaDB.")

# --- 6. Retrieval Test ---
query = "In computer systems, how is memory accessed differently from input/output components?"

print(f"\n--- Testing Retrieval for Query: '{query}' ---")

# Ask the collection (and therefore the custom model attached to it) for the
# two closest passages, returning their text and their distances.
results = collection.query(
    query_texts=[query],
    n_results=2,
    include=['documents', 'distances'],
)

# --- 7. Final Retrieval Output ---
print("\n--- Top Retrieved Contexts ---")

# Only one query was sent, so its hits are the first entry of each result list.
top_documents = results['documents'][0]
top_distances = results['distances'][0]
for rank, (passage, distance) in enumerate(zip(top_documents, top_distances), start=1):
    print(f"{rank}. Distance: {distance:.4f}")
    print(f"   Document: {passage}")

# --- Optional: Basic Semantic Test (using model directly) ---
# Independent of the ChromaDB retrieval above: the model is called directly
# and cosine similarity is computed between the first sentence and each of
# the other two.
sentences_to_test = [
    "The Von Neumann architecture uses shared buses.",
    "The Von Neumann architecture separates memory and I/O.",
    "This has nothing to do with physics or communication theory.",
]

embeddings = embedder.encode(sentences_to_test, convert_to_tensor=True)

# One embedding row per sentence, in the same order as the list above.
anchor_vec, related_vec, unrelated_vec = embeddings
similarity = util.cos_sim(anchor_vec, related_vec)
dissimilarity = util.cos_sim(anchor_vec, unrelated_vec)

print("\n--- Model Direct Similarity Test ---")
related_score = similarity.item()
unrelated_score = dissimilarity.item()
print(f"Related Similarity: {related_score:.4f}")
print(f"Unrelated Similarity: {unrelated_score:.4f}")


def add_documents_to_collection(collection: "chromadb.Collection", docs: list[str]):
    """
    Adds a list of text documents to a given ChromaDB collection.

    IDs follow the ``doc_<n>`` pattern and continue from the number of
    records already in the collection, so calling this on a non-empty
    collection (or several times in a row) appends new records instead of
    reusing ``doc_0``, ``doc_1``, ... for different text. On an empty
    collection the IDs are ``doc_0`` .. ``doc_<len(docs) - 1>``.

    Args:
        collection: The ChromaDB Collection object.
        docs: A list of string documents to be indexed. An empty list is a
            no-op.
    """
    if not docs:
        # Nothing to index; skip the backend call entirely.
        return

    # Continue numbering after the records that already exist.
    # NOTE(review): this assumes every stored ID was produced by this
    # numbering scheme and that no records were deleted; otherwise a
    # count-based ID could still coincide with an existing one.
    first_index = collection.count()
    ids = [f"doc_{first_index + offset}" for offset in range(len(docs))]
    collection.add(
        documents=docs,
        ids=ids,
    )

def retrieve_documents(collection: "chromadb.Collection", query: str, n_results: int = 5) -> dict:
    """
    Retrieves the top N relevant documents from the ChromaDB collection based on a query.

    The query string is sent as a single-element batch, so each list in the
    result holds exactly one inner list: the hits for ``query`` are at
    ``result['documents'][0]`` and ``result['distances'][0]``.

    Args:
        collection: The ChromaDB Collection object.
        query: The search query string.
        n_results: The number of documents to retrieve.

    Returns:
        A dictionary containing the retrieval results (documents and distances).
    """
    return collection.query(
        query_texts=[query],  # wrap the single query as a batch of one
        n_results=n_results,
        include=['documents', 'distances'],
    )