In [1]:
## RAG Pipeline - DB Ingestion to Vector DB

In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))
    
    print(f"Found {len(pdf_files)} PDF files to process")
    
    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()
            
            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")
            
        except Exception as e:
            print(f"  ✗ Error: {e}")
    
    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data/pdf_files/")

Found 2 PDF files to process

Processing: attention.pdf
  ✓ Loaded 15 pages

Processing: embeddings.pdf
  ✓ Loaded 8 pages

Total documents loaded: 23


In [4]:
all_pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\pdf_files\\attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversit

In [5]:
## text splitting into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split into {len(split_docs)} chunks")

    if split_docs:
        print(f"Sample chunk (first document):\n{split_docs[0].page_content[:200]}...\n")
    return split_docs

chunks = split_documents(all_pdf_documents)
chunks

Split into 89 chunks
Sample chunk (first document):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...



[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '..\\data\\pdf_files\\attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1', 'source_file': 'attention.pdf', 'file_type': 'pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversit

In [6]:
## Embedding and vectorstoredb

import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
class EmbeddingManager:
    def __init__(self, model_name: str= "all-MiniLM-L6-v2"):
        self.model_name= model_name
        self.mode= None
        self._load_model()
    def _load_model(self):
        try:
            self.model = SentenceTransformer(self.model_name)
            print(f"Loaded embedding model: {self.model_name}")
            print(f"Model Dimensionality: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise
    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        if not self.model:
            raise ValueError("Embedding model not loaded.")
        print(f"Generating embeddings for {len(texts)} texts...")
        embeddings= self.model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings
    # def get_sentence_embedding_dimension(self) -> int:
    #     if not self.model:
    #         raise ValueError("Embedding model not loaded.")
    #     return self.model.get_sentence_embedding_dimension()

### initialize Embedding Manager
embedding_manager = EmbeddingManager()
embedding_manager 

Loaded embedding model: all-MiniLM-L6-v2
Model Dimensionality: 384


<__main__.EmbeddingManager at 0x1d64d7eb4d0>

In [8]:
## VectorStoreDB

In [10]:
class VectorStore:
    def __init__(self, collection_name: str="pdf_documents", persist_directory: str="../data/vector_store/"):
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()
    def _initialize_store(self):
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path= self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name= self.collection_name,
                metadata= {"description": "PDF Documents Vector Store"})
            print(f"Initialized VectorStore with collection: {self.collection_name}")
            print(f"Existing number of vectors in store: {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing VectorStore: {e}")
            raise
    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store
        
        Args:
            documents: List of LangChain documents
            embeddings: Corresponding embeddings for the documents
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")
        
        print(f"Adding {len(documents)} documents to vector store...")
        
        # Prepare data for ChromaDB
        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []
        
        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)
            
            # Prepare metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)
            
            # Document content
            documents_text.append(doc.page_content)
            
            # Embedding
            embeddings_list.append(embedding.tolist())
        
        # Add to collection
        try:
            self.collection.add(
                ids=ids,
                embeddings=embeddings_list,
                metadatas=metadatas,
                documents=documents_text
            )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
            
        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

vectorstore=VectorStore()
vectorstore 

Initialized VectorStore with collection: pdf_documents
Existing number of vectors in store: 0


<__main__.VectorStore at 0x1d64dabcec0>

In [12]:
## Convert text to embeddings 

texts= [chunk.page_content for chunk in chunks]

## Generate embeddings
embeddings= embedding_manager.generate_embeddings(texts)

## Store in vector DB
vectorstore.add_documents(chunks, embeddings)

Generating embeddings for 89 texts...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches: 100%|██████████| 3/3 [00:05<00:00,  1.88s/it]


Generated embeddings with shape: (89, 384)
Adding 89 documents to vector store...
Successfully added 89 documents to vector store
Total documents in collection: 89


In [21]:
## retriever from vector DB
class RAGRetriever: 
    def __init__(self, vectorstore: VectorStore, embedding_manager: EmbeddingManager):
        self.vectorstore= vectorstore
        self.embedding_manager= embedding_manager
    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0) -> List[Dict[str, Any]]:
        """
        Retrieve relevant documents for a query
        
        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
            
        Returns:
            List of dictionaries containing retrieved documents and metadata
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")
        
        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]
        
        # Search in vector store
        try:
            results = self.vectorstore.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )
            
            # Process results
            retrieved_docs = []
            
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]
                
                for i, (doc_id, document, metadata, distance) in enumerate(zip(ids, documents, metadatas, distances)):
                    # Convert distance to similarity score (ChromaDB uses cosine distance)
                    similarity_score = 1 - distance
                    
                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance': distance,
                            'rank': i + 1
                        })
                
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else:
                print("No documents found")
            
            return retrieved_docs
            
        except Exception as e:
            print(f"Error during retrieval: {e}")
            return []

rag_retriever=RAGRetriever(vectorstore,embedding_manager)

In [22]:
rag_retriever

<__main__.RAGRetriever at 0x1d64dabe7b0>

In [27]:
rag_retriever.retrieve("explain attention")

Retrieving documents for query: 'explain attention'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 42.92it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_8b4bacff_12',
  'content': '3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3',
  'metadata': {'author': '',
   'creationdate': '2023-08-03T00:07:29+00:00',
   'page_label': '3',
   'page': 2,
   'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
   'source_file': 'attention.pdf',
   'creator': 'LaTeX with hyperref',
   'moddate': '2023-08-03T00:07:29+00:00',
   'trapped': '/False',
   'title': '',
   'producer': 'pdfTeX-1.40.25',
   'doc_index': 12,
   'total_pages': 15,
   'subject': '',
   'content_length': 216,
   'source': '..\\data\\pdf_files\\attention.pdf',
   'keywords': '',
   'file_type': 'pdf'},
  'similarity_score': 0.28536224365234375,
  'distance': 0.7146377563476562,
  'rank': 1},
 {'id': 'doc_75e345d1_49',
  'content': 'Attention Vi

In [25]:
rag_retriever.retrieve("The future of culture in more-than-human worlds of being")

Retrieving documents for query: 'The future of culture in more-than-human worlds of being'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 49.39it/s]

Generated embeddings with shape: (1, 384)
Retrieved 4 documents (after filtering)





[{'id': 'doc_16c67038_78',
  'content': 'It’s not a trick question, or even a nihilistic one . It is to ask, what is left for us to do, once we have \ninvented the ultimate technology? \nThe answer is we co -evolve, and the human side of the equation is that wonderful old cultural \nrole of judgment and taste. What makes humans special is neither our rationality nor our cooperation, \nbut our ability to discern what is good. It is our ability to judge quality, to discern reﬁnement, to have \nstyle. That is something that all of us can do, each of us can develop our skills of taste and judgment \nthrough each lifetime. That is our highest power, individually and collectively, and we can enter a co-\nevolutionary union with machines, a cybernetic feedback loop in which machines propose and \nhumans choose.   \nThe culturetron \nMany years ago, the founder of Cultural Science – John Hartley – had the idea for something he called \nthe Culturetron. It was a play on the idea of a Synchrotro

In [26]:
rag_retriever.retrieve("explain encoder decoder architecture")

Retrieving documents for query: 'explain encoder decoder architecture'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 43.54it/s]

Generated embeddings with shape: (1, 384)
Retrieved 4 documents (after filtering)





[{'id': 'doc_be24d682_9',
  'content': 'Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence\nof continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output\nsequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive\n[10], consuming the previously generated symbols as additional input when generating the next.\n2',
  'metadata': {'page_label': '2',
   'creationdate': '2023-08-03T00:07:29+00:00',
   'file_type': 'pdf',
   'doc_index': 9,
   'content_length': 385,
   'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
   'page': 1,
   'source': '..\\data\\pdf_files\\attention.pdf',
   'source_file': 'attention.pdf',
   'subject': '',
   'creator': 'LaTeX with hyperref',
   'keywords': '',
   'trapped': '/False',
   'title': '',
   'producer': 'pdfTeX-1.40.25',
   'total_pages': 15,
   'author': '',
   'moddate': '2023-

In [33]:
## vector db llm context output

from langchain_ollama import ChatOllama
import os
from dotenv import load_dotenv
# Get the notebook's directory and construct path to .env
notebook_dir = Path().absolute()
env_path = notebook_dir.parent / "chatbot" / ".env"

# Load environment variables
load_dotenv(dotenv_path=env_path)

OLLAMA_API_KEY= os.getenv("OLLAMA_API_KEY")

In [34]:
llm= ChatOllama(model="llama2", temperature=0.1, api_key= OLLAMA_API_KEY)

## Simple rag to retrieve context and generate answer
def rag_simple(query, retriever, llm, top_k=3):
    results= retriever.retrieve(query, top_k= top_k)
    context= "\n\n".join([f"Source: {res['metadata'].get('source_file','unknown')}\nContent: {res['content']}" for res in results]) if results else ""
    if not context:
        return "No relevant context found."
    # generate the ans using llm
    prompt= f"Use the following context to answer the question:\n\n{context}\n\nQuestion: {query}\nAnswer:"

    response= llm.invoke([prompt.format(context=context, question=query)])
    return response.content

In [35]:
answer= rag_simple("what is attention mechanism", rag_retriever, llm, top_k=3)
answer

Retrieving documents for query: 'what is attention mechanism'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 21.75it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filtering)





'Based on the provided context, the attention mechanism can be described as a function that maps a query and a set of key-value pairs to an output. The query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values based on the similarity between the query and the keys. In other words, the attention mechanism allows the model to focus on specific parts of the input when computing the output, rather than treating the entire input equally. This is particularly useful in natural language processing tasks, where the input sequences can be long and complex, and the model needs to be able to capture long-distance dependencies between different parts of the sequence.'

In [37]:
def rag_advanced(query, retriever, llm, top_k=5, min_score=0.2, return_context=False):
    """
    RAG pipeline with extra features:
    - Returns answer, sources, confidence score, and optionally full context.
    """
    results = retriever.retrieve(query, top_k=top_k, score_threshold=min_score)
    if not results:
        return {'answer': 'No relevant context found.', 'sources': [], 'confidence': 0.0, 'context': ''}
    
    # Prepare context and sources
    context = "\n\n".join([doc['content'] for doc in results])
    sources = [{
        'source': doc['metadata'].get('source_file', doc['metadata'].get('source', 'unknown')),
        'page': doc['metadata'].get('page', 'unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:300] + '...'
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])
    
    # Generate answer
    prompt = f"""Use the following context to answer the question concisely.\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"""
    response = llm.invoke([prompt.format(context=context, query=query)])
    
    output = {
        'answer': response.content,
        'sources': sources,
        'confidence': confidence
    }
    if return_context:
        output['context'] = context
    return output

# Example usage:
result = rag_advanced("what is attention mechanism", rag_retriever, llm, top_k=3, min_score=0.1, return_context=True)
print("Answer:", result['answer'])
print("Sources:", result['sources'])
print("Confidence:", result['confidence'])
print("Context Preview:", result['context'][:300])

Retrieving documents for query: 'what is attention mechanism'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 39.89it/s]

Generated embeddings with shape: (1, 384)
Retrieved 2 documents (after filtering)





Answer: The attention mechanism is a method used in deep learning models to allow the model to focus on specific parts of the input when computing the output. In the context of the passage, the attention mechanism is used in the encoder self-attention layer of a transformer model to help the model understand which parts of the input sequence are important for computing the output. The attention mechanism works by computing a weighted sum of the input elements, where the weights are learned during training and reflect the importance of each input element for the task at hand. In this passage, the attention mechanism is used to help the model understand which parts of the input sentence are important for understanding the meaning of the verb "making".
Sources: [{'source': 'attention.pdf', 'page': 2, 'score': 0.25035518407821655, 'preview': '3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, a