### Data Ingestion

##### Langchain Document Structure

In [14]:
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [27]:
def process_all_pdf(pdf_directory):
    all_documents =[]
    pdf_dir = Path(pdf_directory)
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    print("Total pdf files: ", len(pdf_files))

    for pdf_file in pdf_files:
        print(f"\nProcessing pdf with name {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            document = loader.load()

            for doc in document:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = "pdf"
                print("Metadata added Successfully")

            all_documents.extend(document)
            print(f"{len(document)} pages loaded")
        except Exception as e:
            print(f"Error occurred in {pdf_file.name}")
    print(f"Total {len(all_documents)} pages")
    return all_documents



In [28]:
pdf_dir = "../data/pdfs"
documents  = process_all_pdf(pdf_dir)
documents

Total pdf files:  3

Processing pdf with name attention.pdf
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
11 pages loaded

Processing pdf with name llms.pdf
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Successfully
Metadata added Succes

[Document(metadata={'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be superiorin quality while being more parallelizable and requiring significantly less timeto train. Our single model with 165 million parameters, achieves 27.5 BLEU onEnglish-to-German translation, improving over the existing best ensemble result by over 1 BLEU. On 

In [35]:
  ### Split into Chunks  
  def split_document(document, chunk_size=1000, chunk_overlap=200):
     text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
     )

     split_docs = text_splitter.split_documents(document)
     print(f"Splitted {len(document)} documents into {len(split_docs)} chunks")

     if(split_docs):
        print("Example chunk")
        print(f"Content: {split_docs[0].page_content[:200]}......")
        print(f"Metadata: {split_docs[0].metadata}")
    
     return split_docs
  

In [37]:
chunks = split_document(documents)

Splitted 97 documents into 539 chunks
Example chunk
Content: Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz......
Metadata: {'producer': 'PyPDF2', 'creator': 'PyPDF', 'creationdate': '', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'publisher': 'Curran Associates, Inc.', 'language': 'en-US', 'created': '2017', 'eventtype': 'Poster', 'description-abstract': 'The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks in an encoder and decoder configuration. The best performing such models also connect the encoder and decoder through an attentionm echanisms.  We propose a novel, simple network architecture based solely onan attention mechanism, dispensing with recurrence and convolutions entirely.Experiments on two machine translation tasks show these models to be super

### Embeddings and VectorStore DB

In [39]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
class EmbeddingManager:
    """Handles document embedding generation using sentence transformers"""

    def __init__(self, model_name : str = "all-MiniLM-L6-v2"):
        """Initializing embedding manager
        Args: model_name : hugging face model name for sentence transformer
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load sentence transformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded Successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error occurred in loading model {self.model_name} : {e}")
            raise
    
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Generate embedding for a list of texts
        Args:
            texts: list of text strings to embed
        Returns: 
            numpy array of embeddings with shape (len(texts), embedding_dims)
        """

        if not self.model:
            raise ValueError("Model not loaded")
        print(f"Generating embeddings for {len(texts)} texts")
        embeddings = self.model.encode(texts, show_progress_bar = True)
        print(f"Generating embeddings with shape {embeddings.shape}")
        return embeddings

    def get_embeddings_dimensions(self) -> int:
        """Get the embedding dimensions of the model"""
        if not self.model:
            raise ValueError("Model not loaded")
        return self.mode.get_sentence_embedding_dimension()

##initializing embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model loaded Successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x162ba7590>

In [43]:
import os
import uuid
import numpy as np
import chromadb
from typing import List, Any

class VectorStore:
    """For storing embeddings in Chroma vector store"""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """
        Args:
            collection_name : Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory 
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize ChromaDB client and collection"""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"}
            )

            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"❌ Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Args:
            documents: List of LangChain Document objects
            embeddings: Corresponding embeddings for documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"  
            ids.append(doc_id)

            metadata = dict(doc.metadata)
            metadata["doc_index"] = i
            metadata["content_length"] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())

        try:
            self.collection.add( 
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store.")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"❌ Error adding documents to vector store: {e}")
            raise


# Initialize and test
vector_store = VectorStore()
vector_store

Vector store initialized. Collection: pdf_documents
Existing documents in collection: 0


<__main__.VectorStore at 0x165417b10>

In [47]:
texts = [doc.page_content for doc in chunks]

##Generate the embeddings
embeddings = embedding_manager.generate_embeddings(texts)

##Store embedding in vector store
vector_store.add_documents(chunks, embeddings) 

Generating embeddings for 539 texts


Batches: 100%|██████████| 17/17 [00:09<00:00,  1.76it/s]


Generating embeddings with shape (539, 384)
Adding 539 documents to vector store...
Successfully added 539 documents to vector store.
Total documents in collection: 539


### Retriever Pipeline from Vector Store 

In [58]:
class RAGRetriever:
    """Handles query based retriever from the vectore store"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
            Initialize the retriever
            Args:
                vector_store: vector database that contains vector embeddings
                embedding_manager: Embedding manager for generate query embeddings
        """

        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a particular query
        Args:
            query: The search query
            top_k: Number of top results to run
            score_threshold: Minimum similarity score threshold
        
        Return:
            List of Dictionaries containing retrieved documnets with their metadata
        """

        print(f"Retrieving documents for query: {query}")
        print(f"Top K {top_k} , Score Threshold: {score_threshold}")

        ##Generate embedding for the asked query
        query_embedding = embedding_manager.generate_embeddings([query])[0]

        ##Search context in vector store
        try: 
            results =  self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results = top_k
            )

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    similarity = 1-distance

                    if similarity >= score_threshold:
                        retrieved_docs.append({
                            'id' : doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity,
                            'distance': distance,
                            'rank': i+1
                        })
                    
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
         
            else:
                print("No documents found")
            return retrieved_docs 

        except Exception as e:
            print(f"Error during retrieval: {e}")
            return [] 
rag_retriever = RAGRetriever(vector_store, embedding_manager)

In [59]:
rag_retriever

<__main__.RAGRetriever at 0x1654cdc50>

In [61]:
rag_retriever.retrieve("What is Large Language Models")

Retrieving documents for query: What is Large Language Models
Top K 5 , Score Threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]

Generating embeddings with shape (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_973cf1c7_47',
  'content': 'Qiu), muhammad.saqib@data61.csiro.au (Muhammad Saqib),\nsaeed.anwar@kfupm.edu.sa (Saeed Anwar),\nmuhammad.usman@kfupm.edu.sa (Muhammad Usman),\nnaveed.akhtar1@unimelb.edu.au (Naveed Akhtar),\nnick.barnes@anu.edu.au (Nick Barnes), ajmal.mian@uwa.edu.au\n(Ajmal Mian)\nFigure 1: The trend of papers released over the years containing keywords\n"Large Language Model", "Large Language Model+ Fine-Tuning", and "Large\nLanguage Model + Alignment".\nPreprint submitted to Elsevier October 18, 2024\narXiv:2307.06435v10  [cs.CL]  17 Oct 2024',
  'metadata': {'keywords': '',
   'producer': 'pdfTeX-1.40.25',
   'creationdate': '2024-10-18T00:29:28+00:00',
   'source_file': 'llms.pdf',
   'doc_index': 47,
   'trapped': '/False',
   'subject': '',
   'author': 'Humza Naveed; Asad Ullah Khan; Shi Qiu; Muhammad Saqib; Saeed Anwar; Muhammad Usman; Naveed Akhtar; Nick Barnes; Ajmal Mian;',
   'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX 

## Integrationg VectorDB Context Pipeline with LLM output

In [73]:
## simple rag pipeline with GROQ LLM

from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv() 

##Initialize groq LLM and set GROQ_API_KEY in Environments
groq_api_key = os.getenv("GROQ_API_KEY") 
llm=ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.1-8b-instant",temperature=0.1,max_tokens=1024)

def rag(query, retriever, llm , top_k=3):
    """Retreieve the context"""
    results = retriever.retrieve(query, top_k=top_k)
    context="\n\n".join([doc["content"] for doc in results]) if results else ""
    if not context:
        print(f"No relevant context found to answer the question")
    
    """Generate the answer using groq LLM"""

    prompt = f"""Use the following context to answer the question concisely.
        Context: {context}
        Question: {query}
        Answer: 
    """

    response = llm.invoke([prompt.format(context=context, query=query)])
    return response.content
     

In [74]:
ans = rag("What is a Large Language Model", rag_retriever, llm)
ans

Retrieving documents for query: What is a Large Language Model
Top K 3 , Score Threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.11it/s]


Generating embeddings with shape (1, 384)
Retrieved 3 documents (after filtering)


'A Large Language Model (LLM) is a type of artificial intelligence model that has demonstrated remarkable capabilities in natural language processing tasks.'

### Enhanced RAG Pipeline

In [91]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    Rag with extra features :
        retruns answers, sources, confidence, and optionally full context
    """

    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {"source": "No relevant context found", "source": [] , "confidence": 0.0, "context": ""}

    context = "\n\n".join([doc["content"] for doc in results])
    sources = [{
         "source" : doc["metadata"].get("source", doc["metadata"].get("source", "unknown")),
         "page" : doc["metadata"].get("page", "unknown"),
         "score" : doc["similarity_score"],
         "preview" : doc["content"][:300] + "....."
    } for doc in results]

    confidence = max([doc["similarity_score"] for doc in results])

    prompt = f"""Use the follwoing context to answer the following question.
        context : {context}
        question : {query}
        answer : ""
    """

    response = llm.invoke([prompt.format(context=context, query=query)])

    output = {
        "answer" : response.content,
        "source" : sources,
        "confidence" : confidence
    }

    if return_context:
        output["context"] = context

    return output

In [96]:
ans = rag_advanced("What is multi head attention?", rag_retriever, llm, top_k=3, min_score=0.0, return_context=True)
ans

Retrieving documents for query: What is multi head attention?
Top K 3 , Score Threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.51it/s]


Generating embeddings with shape (1, 384)
Retrieved 2 documents (after filtering)


{'answer': 'Multi-head attention is a mechanism used in the Transformer model that allows the model to jointly attend to information from different representation subspaces at different positions. It is achieved by applying attention in parallel to linearly projected versions of the queries, keys, and values, and then concatenating and projecting the output. This is done using multiple attention "heads" (in this case, 8 parallel attention layers), each with its own set of learned linear projections.',
 'source': [{'source': '../data/pdfs/attention.pdf',
   'page': 4,
   'score': 0.13510406017303467,
   'preview': 'MultiHead(Q,K,V ) = Concat(head1,..., headh)WO\nwhere headi = Attention(QWQ\ni ,KW K\ni ,VW V\ni )\nWhere the projections are parameter matricesWQ\ni ∈Rdmodel×dk , WK\ni ∈Rdmodel×dk , WV\ni ∈Rdmodel×dv\nand WO ∈Rhdv×dmodel .\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these .....'},
  {'source': '../data/pdfs/attention.pdf',
   'page': 3,
  

In [97]:
# --- Advanced RAG Pipeline: Streaming, Citations, History, Summarization ---
from typing import List, Dict, Any
import time

class AdvancedRAGPipeline:
    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.history = []  # Store query history

    def query(self, question: str, top_k: int = 5, min_score: float = 0.2, stream: bool = False, summarize: bool = False) -> Dict[str, Any]:
        # Retrieve relevant documents
        results = self.retriever.retrieve(question, top_k=top_k, score_threshold=min_score)
        if not results:
            answer = "No relevant context found."
            sources = []
            context = ""
        else:
            context = "\n\n".join([doc['content'] for doc in results])
            sources = [{
                'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
                'page': doc['metadata'].get('page', 'unknown'),
                'score': doc['similarity_score'],
                'preview': doc['content'][:120] + '...'
            } for doc in results]
            # Streaming answer simulation
            prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"""
            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end='', flush=True)
                    time.sleep(0.05)
                print()
            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

        # Add citations to answer
        citations = [f"[{i+1}] {src['source']} (page {src['page']})" for i, src in enumerate(sources)]
        answer_with_citations = answer + "\n\nCitations:\n" + "\n".join(citations) if citations else answer

        # Optionally summarize answer
        summary = None
        if summarize and answer:
            summary_prompt = f"Summarize the following answer in 2 sentences:\n{answer}"
            summary_resp = self.llm.invoke([summary_prompt])
            summary = summary_resp.content

        # Store query history
        self.history.append({
            'question': question,
            'answer': answer,
            'sources': sources,
            'summary': summary
        })

        return {
            'question': question,
            'answer': answer_with_citations,
            'sources': sources,
            'summary': summary,
            'history': self.history
        }

In [103]:
# Example usage:
adv_rag = AdvancedRAGPipeline(rag_retriever, llm)
result = adv_rag.query("what are Large Language Models", top_k=3, min_score=0.0, stream=True, summarize=True)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
print("History:", result['history'][-1])
# result

Retrieving documents for query: what are Large Language Models
Top K 3 , Score Threshold: 0.0
Generating embeddings for 1 texts


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.24it/s]

Generating embeddings with shape (1, 384)
Retrieved 3 documents (after filtering)
Streaming answer:
Use the following context to answer the question concisely.
Context:
Qiu), muhammad.saqib@data61.csiro.au (Muhammad Saqib),
saeed.anwar@kfupm.edu.sa (Saeed Anwar),
muhammad.usman@kfupm.edu.sa (Muhammad Usman),
naveed.akhtar1@unimelb.edu.au (Naveed Akhtar),
nick.barnes@anu.edu.au (Nick Barnes), ajmal.mian@uwa.edu.au
(Aj




mal Mian)
Figure 1: The trend of papers released over the years containing keywords
"Large Language Model", "Large Language Model+ Fine-Tuning", and "Large
Language Model + Alignment".
Preprint submitted to Elsevier October 18, 2024
arXiv:2307.06435v10  [cs.CL]  17 Oct 2024

windows for large language models, in: Proceedings of the 61st Annual
Meeting of the Association for Computational Linguistics (V olume 1:
Long Papers), 2023, pp. 6383–6402. 18
[191] W. Wang, L. Dong, H. Cheng, X. Liu, X. Yan, J. Gao, F. Wei,
Augmenting language models with long-term memory, arXiv preprint
arXiv:2306.07174 (2023). 18
[192] X. Xu, Z. Gou, W. Wu, Z.-Y . Niu, H. Wu, H. Wang, S. Wang, Long
time no see! open-domain conversation with long-term persona memory,
arXiv preprint arXiv:2203.05797 (2022). 18
[193] S. Borgeaud, A. Mensch, J. Ho ffmann, T. Cai, E. Rutherford, K. Milli-
can, G. B. Van Den Driessche, J.-B. Lespiau, B. Damoc, A. Clark, et al.,
Improving language models by retrieving from trillions o