## Data Ingestion

In [20]:
### document  datastructure

from langchain_core.documents import Document 

In [21]:
doc = Document(
    page_content = "this is the content of the document",
    metadata = {"author": "John Doe", "length": 42, "date_created": "2024-06-15"}
)
doc

Document(metadata={'author': 'John Doe', 'length': 42, 'date_created': '2024-06-15'}, page_content='this is the content of the document')

In [22]:
# create a simple txt file
import os
os.makedirs("../data/text_files", exist_ok=True
            )

In [23]:
sample_text = {"data/text_files/python_intro.txt":
               """Python is a high-level, interpreted programming language known for its readability and versatility. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python has a large standard library and a vibrant ecosystem of third-party packages, making it suitable for a wide range of applications such as web development, data analysis, artificial intelligence, scientific computing, and more.""",
                "data/text_files/machine_learning.txt":"""Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. It involves training models on large datasets to recognize patterns and make predictions or decisions based on new data. Common types of machine learning include supervised learning, unsupervised learning, and reinforcement learning. Applications of machine learning span various fields, including image and speech recognition, natural language processing, recommendation systems, and autonomous vehicles."""
               }

for file_path, content in sample_text.items():
    with open(f"../{file_path}", "w", encoding="utf-8") as f:
        f.write(content)

In [24]:
### TextLoader
# from langchain_core.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader

loader=TextLoader("../data/text_files/python_intro.txt", encoding="utf-8")
document = loader.load()
document

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python is a high-level, interpreted programming language known for its readability and versatility. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python has a large standard library and a vibrant ecosystem of third-party packages, making it suitable for a wide range of applications such as web development, data analysis, artificial intelligence, scientific computing, and more.')]

In [25]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader

dir_loader = DirectoryLoader(
    "../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
    show_progress=False
)

documents=dir_loader.load()
documents

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content='Python is a high-level, interpreted programming language known for its readability and versatility. It supports multiple programming paradigms, including procedural, object-oriented, and functional programming. Python has a large standard library and a vibrant ecosystem of third-party packages, making it suitable for a wide range of applications such as web development, data analysis, artificial intelligence, scientific computing, and more.'),
 Document(metadata={'source': '../data/text_files/machine_learning.txt'}, page_content='Machine learning is a subset of artificial intelligence that focuses on the development of algorithms and statistical models that enable computers to perform tasks without explicit instructions. It involves training models on large datasets to recognize patterns and make predictions or decisions based on new data. Common types of machine learning include supervised learning, un

In [26]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf", ## Pattern to match files  
    loader_cls= PyMuPDFLoader, ##loader class to use
    show_progress=False

)

pdf_documents=dir_loader.load()
pdf_documents

[Document(metadata={'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': '../data/pdf/NIPS-2017-attention-is-all-you-need-Paper.pdf', 'file_path': '../data/pdf/NIPS-2017-attention-is-all-you-need-Paper.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz

In [27]:
### Text splitting get into chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents,chunk_size=1000,chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")
    
    # Show example of a chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
    
    return split_docs

chunks=split_documents(pdf_documents)

Split 33 documents into 106 chunks

Example chunk:
Content: Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz...
Metadata: {'producer': 'PyPDF2', 'creator': '', 'creationdate': '', 'source': '../data/pdf/NIPS-2017-attention-is-all-you-need-Paper.pdf', 'file_path': '../data/pdf/NIPS-2017-attention-is-all-you-need-Paper.pdf', 'total_pages': 11, 'format': 'PDF 1.3', 'title': 'Attention is All you Need', 'author': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin', 'subject': 'Neural Information Processing Systems http://nips.cc/', 'keywords': '', 'moddate': '2018-02-12T21:22:10-08:00', 'trapped': '', 'modDate': "D:20180212212210-08'00'", 'creationDate': '', 'page': 0}


## embedding and vectorStoreDB

In [28]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid # unique id generator
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
class EmbeddingManager:
    """Handles document embedding generation using Sentence Transformers."""

    def __init__(self, model_name:str = "all-MiniLM-L6-v2"):
        """Initialise the embedding manager
        Args:
            model_name: HuggingFace model name for sentence embeddings"""
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self): # private method
        """Load the sentence transformer model."""
        try:
            print(f"Loading embedding model: {self.model_name}...")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model loaded successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise e
        
    def generate_embeddings(self, texts:List[str]) -> np.ndarray:
        """Generate embeddings for a list of texts. 
        
        Returns: 
            numpy array of embeddings with shape (len(texts), embedding_dim)"""
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        print(f'Generated embeddings for {len(texts)} texts. Shape: {embeddings.shape}')
        return embeddings

    def get_embedding_dimension(self) -> int:
        """Get the dimension of the embeddings."""
        if not self.model:
            raise ValueError("Embedding model is not loaded.")
        return self.model.get_sentence_embedding_dimension()
    

### Initialise embedding manager
embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2...
Model loaded successfully. Embedding dimension: 384
Model loaded successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x12af5ef90>

### Vector Store

In [30]:
class VectorStore:
    """Manages document embeddings in a ChromaDB vector store."""

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store"):
        """ Initialie the vector store
        
        Args: 
            collection_name: Name of the ChromaDB collection
            persist_directory: Directory to persist the vector store data"""
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_vector_store()

    def _initialize_vector_store(self):
        """Initialise the ChromaDB client and collection."""
        try:
            # Create persistent ChromaDB client 
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection
            self.collection = self.client.get_or_create_collection(name=self.collection_name, metadata={"description": "PDF document embeddings for RAG"})
            print(f"Vector store initialized with collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise e
        

    def add_documents(self, documents: List[Any], embeddings:np.ndarray):
        """
        Add documents and their embeddings to the vector store. 

        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents and embeddings must match.")
        print(f"Adding {len(documents)} documents to the vector store...")

        # Prepare data for insertion
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i 
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            # Document content
            documents_text.append(doc.page_content)

            # Embedding
            embeddings_list.append(embedding.tolist())

        try:
            # Insert into collection
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )
            print(f"Successfully added {len(documents)} documents to the vector store.")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise e
        
vectorstore = VectorStore()

Vector store initialized with collection: pdf_documents
Existing documents in collection: 106


In [31]:
### Convert the text to embeddings
texts=[doc.page_content for doc in chunks]

### generate the embeddings
embeddings = EmbeddingManager().generate_embeddings(texts)

### store in the vector database
vectorstore.add_documents(chunks, embeddings)

Loading embedding model: all-MiniLM-L6-v2...
Model loaded successfully. Embedding dimension: 384
Model loaded successfully. Embedding dimension: 384
Generated embeddings for 106 texts. Shape: (106, 384)
Adding 106 documents to the vector store...
Successfully added 106 documents to the vector store.
Generated embeddings for 106 texts. Shape: (106, 384)
Adding 106 documents to the vector store...
Successfully added 106 documents to the vector store.


In [32]:
### Check current collection status
print(f"Total documents in vector store: {vectorstore.collection.count()}")
print(f"Current chunks to add: {len(chunks)}")

Total documents in vector store: 212
Current chunks to add: 106


### Retriever Pipeline from VectorStore

In [33]:
class RAGRetriever:
    """Retriever for fetching relevant documents from the vector store based on a query."""
    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever

        Args:
            vector_store: Vector store containing document embeddings
            embedding_manager: Manager for generating query embeddings
        """ 
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    
    def retrieve(self, query: str, top_k:int = 5, score_threshold: float= 0.0) -> List[Dict[str, Any]]:
        """ 
        Retrieve relevant documents for a query 

        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
        
        Returns: 
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score Threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        # Search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings = [query_embedding.tolist()],
                n_results=top_k)
            
            # Process results
            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold: 
                        retrieved_docs.append({
                            'id': doc_id, 
                            'content': document, 
                            'metadata': metadata, 
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })

                print(f"Retrieved {len(retrieved_docs)} documents after applying score threshold.")

            else:
                print("No documents found for the given query.")

            return retrieved_docs 
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

            

rag_retriever=RAGRetriever(vectorstore, embedding_manager)

In [34]:
rag_retriever.retrieve("What is a attention mechanism?", top_k=3, score_threshold=0.0)

Retrieving documents for query: 'What is a attention mechanism?'
Top K: 3, Score Threshold: 0.0
Generated embeddings for 1 texts. Shape: (1, 384)
Retrieved 0 documents after applying score threshold.


[]

In [35]:
### Debug: Check what PDFs are in the vector store
all_results = vectorstore.collection.get()
sources = set([meta.get('source', 'unknown') for meta in all_results['metadatas']])
print(f"PDFs in vector store ({len(sources)} files):")
for source in sorted(sources):
    print(f"  - {source}")
    
# Check if attention paper exists
attention_docs = [i for i, meta in enumerate(all_results['metadatas']) 
                  if 'attention' in meta.get('source', '').lower()]
print(f"\nDocuments from 'attention' paper: {len(attention_docs)}")

# Sample a chunk to verify content
if attention_docs:
    sample_idx = attention_docs[0]
    print(f"\nSample chunk from attention paper:")
    print(f"Content preview: {all_results['documents'][sample_idx][:300]}...")
else:
    print("\n⚠️ WARNING: No attention paper found in vector store!")

PDFs in vector store (3 files):
  - ../data/pdf/Best 6 AI Patent Drafting Tools in 2025.pdf
  - ../data/pdf/Impact-of-AI-on-Engineering-Innovation.pdf
  - ../data/pdf/NIPS-2017-attention-is-all-you-need-Paper.pdf

Documents from 'attention' paper: 86

Sample chunk from attention paper:
Content preview: Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aid...


## Integration Vectordb Context pipeline with LLM Output

In [36]:
### Simple RAG pipeline with Gemini LLM

from langchain_google_genai import ChatGoogleGenerativeAI
import os
from dotenv import load_dotenv
load_dotenv("../.env")  # Load environment variables from .env file

gemini_api_key = os.getenv("GOOGLE_GENAI_API_KEY")
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=gemini_api_key,
    temperature=0.7
)

### 2. Simple RAG function: retrieve context and generate answer
def rag_simple(query: str, retriever: RAGRetriever, llm, top_k=3):
    ## retrieve the context
    results = retriever.retrieve(query, top_k=top_k)
    context="\n\n".join([doc['content'] for doc in results]) if results else ""
    if not context: 
        return "No relevant context found."

    ## generate the answer using Gemini LLM
    prompt = f"""You are an AI assistant that provides accurate and concise answers based on the provided context.
        Use the following context to answer the question:

        Context: {context}

        Question: {query}

        Answer:"""

    response = llm.invoke(prompt)
    return response.content

In [37]:
answer = rag_simple("attention mechanism", rag_retriever, llm, top_k=3)
print("RAG Answer:", answer)

Retrieving documents for query: 'attention mechanism'
Top K: 3, Score Threshold: 0.0
Generated embeddings for 1 texts. Shape: (1, 384)
Retrieved 3 documents after applying score threshold.
Generated embeddings for 1 texts. Shape: (1, 384)
Retrieved 3 documents after applying score threshold.
RAG Answer: The attention mechanism described includes multi-head attention and self-attention.

Multi-head attention involves linearly projecting queries, keys, and values `h` times with different learned projections to `dk`, `dk`, and `dv` dimensions, respectively. The attention function is performed in parallel on each of these projected versions, producing `dv`-dimensional output values. These outputs are then concatenated and projected again to yield the final values. This allows the model to jointly attend to information from different representation subspaces at various positions, overcoming the limitations of a single attention head where averaging inhibits this.

Self-attention, combined w

In [38]:
### Quick check: See retrieval results
results = rag_retriever.retrieve("attention mechanism", top_k=5, score_threshold=0.0)
print(f"Total results: {len(results)}\n")
for i, doc in enumerate(results, 1):
    print(f"{i}. Score: {doc['similarity_score']:.3f}")
    print(f"   Source: {doc['metadata'].get('source', 'unknown').split('/')[-1]}")
    print(f"   Page: {doc['metadata'].get('page', 'unknown')}")
    print(f"   Preview: {doc['content'][:100]}...\n")

Retrieving documents for query: 'attention mechanism'
Top K: 5, Score Threshold: 0.0
Generated embeddings for 1 texts. Shape: (1, 384)
Retrieved 5 documents after applying score threshold.
Total results: 5

1. Score: 0.091
   Source: NIPS-2017-attention-is-all-you-need-Paper.pdf
   Page: 3
   Preview: we found it beneﬁcial to linearly project the queries, keys and values h times with different, learn...

2. Score: 0.091
   Source: NIPS-2017-attention-is-all-you-need-Paper.pdf
   Page: 3
   Preview: we found it beneﬁcial to linearly project the queries, keys and values h times with different, learn...

3. Score: 0.050
   Source: NIPS-2017-attention-is-all-you-need-Paper.pdf
   Page: 6
   Preview: convolution is equal to the combination of a self-attention layer and a point-wise feed-forward laye...

4. Score: 0.050
   Source: NIPS-2017-attention-is-all-you-need-Paper.pdf
   Page: 6
   Preview: convolution is equal to the combination of a self-attention layer and a point-wise feed-forwar

## Enhanced RAG Pipeline Features

In [None]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with etra features:
    - Returns answer, sources, confidence score, and optionally full context. 
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found. ', 'sources':[], 'confidence':0.0, 'context':''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page','unknown'),
        'similarity_score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])


    # Generate answer
    prompt = f"""Use the following context to answer the question concisely. \nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])

    output = {
        'answer': response.content, 
        'sources': sources, 
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("Give me some tips on patents", rag_retriever, llm, top_k=3, min_score=0.1, rsaeturn_context=True)

print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'Give me some tips on patents'
Top K: 3, Score Threshold: 0.1
Generated embeddings for 1 texts. Shape: (1, 384)
Retrieved 3 documents after applying score threshold.
Answer: Based on the context, here are some tips related to patents:

*   **Prioritize efficiency and accuracy:** Aim to increase these in patent preparation.
*   **Adhere to best practices:** Focus on following best practices throughout the patent process.
*   **Seek cost and time reduction:** Look for ways to reduce the time and cost associated with patent preparation.
*   **Develop full scope of protection:** Strive to develop the most comprehensive protection for your inventions.
*   **Maintain professional oversight:** Even when using advanced tools, keep patent professionals at the helm and in control of the process.
*   **Address all patenting stages:** Consider drafting, filing, prosecution, office action analysis, portfolio strategy/management, and future infringement analyses.
Sour