# RAG - E2E Example
- Chunking
- Simple 'Imaginary' Embedding Model - assuming 2 numbers in Text embeddings (vectors) 1:2 dimension (1 row 2 items) each - 1)How much text talks about Medical stuff ; 2)How much text talks about Software stuff.
- Normalisation in results of Text embeddings
- Store embeddings in Vector Database
- User Query -> Embedding Model (same model) -> Find Closest Embedding
- Create Prompt in Messages[] with 1)Prompt to Claude 2)User Query 3)Closest embedding (chunk) from semantic search, send to Claude for an answer. Return to user.

In [35]:
import voyageai
# Client Setup
from dotenv import load_dotenv

load_dotenv()

client = voyageai.Client()

In [36]:
# Chunk by section
import re


def chunk_by_section(document_text):
    pattern = r"\n## "
    return re.split(pattern, document_text)


# Embedding Generation
def generate_embedding(chunks, model="voyage-3-large", input_type="query"):
    is_list = isinstance(chunks, list)
    input = chunks if is_list else [chunks]
    result = client.embed(input, model=model, input_type=input_type)
    return result.embeddings if is_list else result.embeddings[0]

In [37]:
# VectorIndex implementation
import math
from typing import Optional, Any, List, Dict, Tuple


class VectorIndex:
    def __init__(
            self,
            distance_metric: str = "cosine",
            embedding_fn=None,
    ):
        self.vectors: List[List[float]] = []
        self.documents: List[Dict[str, Any]] = []
        self._vector_dim: Optional[int] = None
        if distance_metric not in ["cosine", "euclidean"]:
            raise ValueError("distance_metric must be 'cosine' or 'euclidean'")
        self._distance_metric = distance_metric
        self._embedding_fn = embedding_fn

    def add_document(self, document: Dict[str, Any]):
        if not self._embedding_fn:
            raise ValueError(
                "Embedding function not provided during initialization."
            )
        if not isinstance(document, dict):
            raise TypeError("Document must be a dictionary.")
        if "content" not in document:
            raise ValueError(
                "Document dictionary must contain a 'content' key."
            )

        content = document["content"]
        if not isinstance(content, str):
            raise TypeError("Document 'content' must be a string.")

        vector = self._embedding_fn(content)
        self.add_vector(vector=vector, document=document)

    def search(
            self, query: Any, k: int = 1
    ) -> List[Tuple[Dict[str, Any], float]]:
        if not self.vectors:
            return []

        if isinstance(query, str):
            if not self._embedding_fn:
                raise ValueError(
                    "Embedding function not provided for string query."
                )
            query_vector = self._embedding_fn(query)
        elif isinstance(query, list) and all(
                isinstance(x, (int, float)) for x in query
        ):
            query_vector = query
        else:
            raise TypeError(
                "Query must be either a string or a list of numbers."
            )

        if self._vector_dim is None:
            return []

        if len(query_vector) != self._vector_dim:
            raise ValueError(
                f"Query vector dimension mismatch. Expected {self._vector_dim}, got {len(query_vector)}"
            )

        if k <= 0:
            raise ValueError("k must be a positive integer.")

        if self._distance_metric == "cosine":
            dist_func = self._cosine_distance
        else:
            dist_func = self._euclidean_distance

        distances = []
        for i, stored_vector in enumerate(self.vectors):
            distance = dist_func(query_vector, stored_vector)
            distances.append((distance, self.documents[i]))

        distances.sort(key=lambda item: item[0])

        return [(doc, dist) for dist, doc in distances[:k]]

    def add_vector(self, vector, document: Dict[str, Any]):
        if not isinstance(vector, list) or not all(
                isinstance(x, (int, float)) for x in vector
        ):
            raise TypeError("Vector must be a list of numbers.")
        if not isinstance(document, dict):
            raise TypeError("Document must be a dictionary.")
        if "content" not in document:
            raise ValueError(
                "Document dictionary must contain a 'content' key."
            )

        if not self.vectors:
            self._vector_dim = len(vector)
        elif len(vector) != self._vector_dim:
            raise ValueError(
                f"Inconsistent vector dimension. Expected {self._vector_dim}, got {len(vector)}"
            )

        self.vectors.append(list(vector))
        self.documents.append(document)

    def _euclidean_distance(
            self, vec1: List[float], vec2: List[float]
    ) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")
        return math.sqrt(sum((p - q) ** 2 for p, q in zip(vec1, vec2)))

    def _dot_product(self, vec1: List[float], vec2: List[float]) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")
        return sum(p * q for p, q in zip(vec1, vec2))

    def _magnitude(self, vec: List[float]) -> float:
        return math.sqrt(sum(x * x for x in vec))

    def _cosine_distance(self, vec1: List[float], vec2: List[float]) -> float:
        if len(vec1) != len(vec2):
            raise ValueError("Vectors must have the same dimension")

        mag1 = self._magnitude(vec1)
        mag2 = self._magnitude(vec2)

        if mag1 == 0 and mag2 == 0:
            return 0.0
        elif mag1 == 0 or mag2 == 0:
            return 1.0

        dot_prod = self._dot_product(vec1, vec2)
        cosine_similarity = dot_prod / (mag1 * mag2)
        cosine_similarity = max(-1.0, min(1.0, cosine_similarity))

        return 1.0 - cosine_similarity

    def __len__(self) -> int:
        return len(self.vectors)

    def __repr__(self) -> str:
        has_embed_fn = "Yes" if self._embedding_fn else "No"
        return f"VectorIndex(count={len(self)}, dim={self._vector_dim}, metric='{self._distance_metric}', has_embedding_fn='{has_embed_fn}')"

In [38]:
with open("./report.md", "r") as f:
    text = f.read()

In [39]:
# 1. Chunk the text by section
chunks = chunk_by_section(text)
chunks[0]  #test

'# **Annual Interdisciplinary Research Review: Cross-Domain Insights**\n'

In [40]:
# 2. Generate embeddings for each chunk
embeddings = generate_embedding(chunks)


In [41]:
embeddings

[[-0.05409230664372444,
  0.014001750387251377,
  -0.017084214836359024,
  0.00073939876165241,
  0.021311333402991295,
  0.03750757500529289,
  -0.047247517853975296,
  0.002155399415642023,
  0.002622003899887204,
  -0.018127553164958954,
  0.013968652114272118,
  0.048404891043901443,
  -0.008821332827210426,
  -0.003977086395025253,
  -0.013989274390041828,
  0.007124318741261959,
  0.032850347459316254,
  0.08918570727109909,
  -0.047520384192466736,
  -0.009951098822057247,
  0.060816843062639236,
  -0.057616446167230606,
  0.06537926197052002,
  -0.0349222831428051,
  -0.018815500661730766,
  0.007888268679380417,
  0.007525506895035505,
  0.04448937252163887,
  -0.03307667747139931,
  -0.014281553216278553,
  -0.0008532857173122466,
  0.0220091063529253,
  0.018964940682053566,
  0.056709978729486465,
  0.012462250888347626,
  0.04076347500085831,
  -0.06260434538125992,
  -0.0197377260774374,
  -0.027588415890932083,
  -0.054499220103025436,
  0.010315739549696445,
  0.0049313

In [42]:
# 3. Create a vector store and add each embedding to it
# Note: converted to a bulk operation to avoid rate limiting errors from VoyageAI
store = VectorIndex()

for embedding, chunk in zip(embeddings, chunks):
    store.add_vector(embedding, {
        "content": chunk})  #Vector DB expects this format - the ID of the chunk or the Text in the chunk is stored in Vector DB along with the Text Embedding


In [43]:
# 4. Some time later, a user will ask a question. Generate an embedding for it
user_question = "What did the software engineering dept do last year?"
user_embedding = generate_embedding(user_question)

RateLimitError: You have not yet added your payment method in the billing page and will have reduced rate limits of 3 RPM and 10K TPM. To unlock our standard rate limits, please add a payment method in the billing page for the appropriate organization in the user dashboard (https://dashboard.voyageai.com/). Even with payment methods entered, the free tokens (200M tokens for Voyage series 3) will still apply. After adding a payment method, you should see your rate limits increase after several minutes. See our pricing docs (https://docs.voyageai.com/docs/pricing) for the free tokens for your model.

In [34]:
# 5. Search the store with the embedding, find the 2 most relevant chunks
results = store.search(
    user_embedding,
    k=2  #K = N of top relevant chunks
)

#Results Output
# return [(doc, dist) for dist, doc in distances[:k]]

for doc, distance in results:
    print(distance, "\n", doc["content"][0:200], "\n")

0.4833183050308575 
 Section 2: Software Engineering - Project Phoenix Stability Enhancements

The Software Engineering division dedicated considerable effort to improving the stability and performance of the core systems 

0.4888823735702059 
 Future Directions

This year's cross-domain insights underscore the interconnectedness of our diverse research and operational activities. The stability enhancements achieved in Software Engineering ( 

