In [1]:
import os
import numpy as np
from typing import List, Any, Union
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
)
from langchain.docstore.document import Document
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import tool
from qdrant_client import QdrantClient
from langchain.prompts import PromptTemplate
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from typing import Annotated
from typing_extensions import TypedDict
from llama_index.llms.ollama import Ollama
from tqdm import tqdm
from langchain_core.messages import HumanMessage
from FlagEmbedding import BGEM3FlagModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# File extensions that get a dedicated loader; any other extension is read as plain text.
SPECIALIZED_LOADERS = {".pdf": PyPDFLoader}

# --- Chunking / embedding parameters ---
CHUNK_SIZE = 4000       # characters per chunk
CHUNK_OVERLAP = 200     # characters shared between neighbouring chunks
MAX_INPUT_SIZE = 4000   # max tokens for embedding model (adjusted for long-input model)

# --- Settings passed to the Ollama LLM ---
REQUEST_TIMEOUT = 300
CONTEXT_WINDOW = 80000
MODEL_NAME = "qwen2:7b"

# A file is left out of the index when any of these fragments occurs in its path.
FOLDERS_TO_EXCLUDE = [
    ".claude/",
    ".conda",
    ".gradio/",
    "__pycache__",
    ".git",
    ".DS_Store",
]

In [3]:
# Initialize the Ollama LLM used for routing decisions and direct responses.
# MODEL_NAME, CONTEXT_WINDOW and REQUEST_TIMEOUT come from the configuration
# cell; they are intentionally not redefined here so there is a single source
# of truth for them.
llm = Ollama(
    model=MODEL_NAME,
    context_window=CONTEXT_WINDOW,
    request_timeout=REQUEST_TIMEOUT,
)

# Smoke-test the Ollama connection before loading anything else.
test_response = llm.complete("Hello")
print(f"Ollama initialized: {test_response.text[:50]}...")

# Embedding model.
# Alternatives used earlier in this notebook's history:
#   - HuggingFaceEmbeddings("sentence-transformers/all-mpnet-base-v2") on CPU
#   - BGEM3FlagModel("BAAI/bge-m3", use_fp16=True, normalize_embeddings=True)
embeddings_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-8B",
    encode_kwargs={'normalize_embeddings': True},
)

# Character-based splitter used to chunk documents longer than CHUNK_SIZE.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

print("Embeddings model and text splitter initialized successfully.")

Ollama initialized: Hello! How can I assist you today?...


  embeddings_model = HuggingFaceEmbeddings(
Fetching 4 files:   0%|          | 0/4 [04:34<?, ?it/s]
Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [None]:
def get_all_files_from_directory(directory_path: str) -> List[str]:
    """
    Recursively collect the paths of all files under a directory.

    A file is skipped when any entry of FOLDERS_TO_EXCLUDE occurs as a
    substring of its path. Directories whose own path already matches an
    exclusion entry are pruned from the walk, so large excluded trees
    are never traversed. Pruning uses the same substring rule, so the returned
    list is the same as filtering every file individually.

    Args:
        directory_path: Root directory to scan.

    Returns:
        File paths (each built with os.path.join from the walk root) in
        os.walk order. Empty if the directory has no files or does not exist.
    """
    def is_excluded(path: str) -> bool:
        return any(fragment in path for fragment in FOLDERS_TO_EXCLUDE)

    all_files = []
    for root, dirs, files in os.walk(directory_path):
        # In-place assignment is what tells os.walk not to descend further.
        # The trailing separator mirrors how the directory appears inside the
        # paths of the files below it, so entries such as ".claude/" still match.
        dirs[:] = [d for d in dirs if not is_excluded(os.path.join(root, d) + os.sep)]
        for file in files:
            file_path = os.path.join(root, file)
            if not is_excluded(file_path):
                all_files.append(file_path)
    return all_files

def load_documents(file_paths: List[str]) -> List[Document]:
    """
    Load documents from a list of file paths using the appropriate loader.

    The loader is chosen from SPECIALIZED_LOADERS by file extension, compared
    case-insensitively so that e.g. "REPORT.PDF" is handled like "report.pdf".
    Any other extension falls back to TextLoader with UTF-8 decoding.

    Args:
        file_paths: Paths of the files to load.

    Returns:
        All documents produced by the loaders, in input order. Files whose
        loader raises are reported on stdout and skipped.
    """
    documents = []
    for file_path in file_paths:
        # Lower-case the extension: the SPECIALIZED_LOADERS keys are lower-case.
        extension = os.path.splitext(file_path)[1].lower()
        loader_class = SPECIALIZED_LOADERS.get(extension)

        try:
            if loader_class:
                loader = loader_class(file_path)
            else:
                # Default to TextLoader for all other files
                loader = TextLoader(file_path, encoding='utf-8')
            documents.extend(loader.load())
        except Exception as e:
            # Best-effort loading: e.g. true binary files cannot be decoded as text.
            print(f"Skipping file {file_path}, could not be read as text. Error: {e}")
            continue

    return documents

def _embed_text(text: str) -> np.ndarray:
    """
    Embed a single piece of text with the module-level ``embeddings_model``.

    Works with both embedding back-ends this notebook references:
    objects exposing ``embed_query`` (the LangChain embeddings interface, e.g.
    HuggingFaceEmbeddings) and objects exposing ``encode(...)['dense_vecs']``
    (the BGEM3FlagModel interface).
    """
    if hasattr(embeddings_model, "embed_query"):
        return np.array(embeddings_model.embed_query(text))
    return np.array(embeddings_model.encode(text, max_length=CHUNK_SIZE)['dense_vecs'])


def chunk_and_embed_documents(documents: List[Document]) -> List[Document]:
    """
    Prepare documents for indexing, chunking the ones longer than CHUNK_SIZE.

    - Empty (whitespace-only) documents are skipped.
    - Documents of at most CHUNK_SIZE characters are passed through unchanged.
    - Longer documents are split with ``text_splitter``; each non-empty chunk
      is embedded and the chunk embeddings are averaged. The result is a new
      Document with the original content plus two extra metadata entries:
      ``avg_embedding`` (list of floats) and ``num_chunks``.

    Args:
        documents: Loaded documents to process.

    Returns:
        The processed documents. A long document for which no chunk could be
        embedded, or whose processing raised, is reported on stdout and left out.
    """
    processed_docs = []
    print(f"Processing {len(documents)} documents with chunking strategy...")

    # Outer progress bar: one tick per document.
    doc_progress = tqdm(documents, desc="Processing documents", unit="doc")

    for doc in doc_progress:
        source_file = doc.metadata.get('source', 'N/A')

        if not doc.page_content.strip():
            doc_progress.set_postfix(status="Skipping empty")
            print(f"Skipping empty document: {source_file}")
            continue

        try:
            doc_progress.set_postfix(status=f"Processing: {os.path.basename(source_file)}")

            if len(doc.page_content) <= CHUNK_SIZE:
                # Small enough to be indexed as-is.
                processed_docs.append(doc)
                print(f"Document {source_file} is small enough, using directly")
                continue

            # Too large: split and average the per-chunk embeddings.
            chunks = text_splitter.split_text(doc.page_content)
            print(f"Document {source_file} chunked into {len(chunks)} pieces")

            chunk_embeddings = []
            # Inner progress bar: one tick per chunk, removed when finished.
            chunk_progress = tqdm(enumerate(chunks),
                                  total=len(chunks),
                                  desc=f"Embedding chunks for {os.path.basename(source_file)}",
                                  unit="chunk",
                                  leave=False)

            for i, chunk in chunk_progress:
                if not chunk.strip():  # Skip empty chunks
                    continue
                try:
                    chunk_progress.set_postfix(status=f"Chunk {i+1}/{len(chunks)}")
                    chunk_embeddings.append(_embed_text(chunk))
                except Exception as e:
                    # One bad chunk must not discard the rest of the document.
                    print(f"Error embedding chunk {i} of {source_file}: {e}")

            chunk_progress.close()

            if not chunk_embeddings:
                print(f"Failed to create embeddings for {source_file}")
                doc_progress.set_postfix(status="✗ Failed")
                continue

            avg_embedding = np.mean(chunk_embeddings, axis=0)

            # Keep the original content; carry the averaged vector in metadata.
            processed_docs.append(
                Document(
                    page_content=doc.page_content,
                    metadata={
                        **doc.metadata,
                        "avg_embedding": avg_embedding.tolist(),  # list for JSON serialization
                        "num_chunks": len(chunks),
                    },
                )
            )
            doc_progress.set_postfix(status=f"✓ Completed: {len(chunks)} chunks averaged")

        except Exception as e:
            print(f"Error processing document {source_file}: {e}")
            doc_progress.set_postfix(status="✗ Error")

    doc_progress.close()
    print(f"Finished processing documents. {len(processed_docs)} documents ready for indexing.")
    return processed_docs

In [None]:
def _index_logic(directory_path: str) -> Union[Qdrant, str]:
    """
    Run the full indexing pipeline for one directory.

    Steps: discover files, load them, run the chunking step, then build an
    in-memory Qdrant collection from the processed documents.

    Args:
        directory_path: Directory to index.

    Returns:
        The Qdrant vector store on success, otherwise a human-readable
        message string describing why indexing stopped.
    """
    if not os.path.isdir(directory_path):
        return f"Error: The provided path '{directory_path}' is not a valid directory."

    print(f"Starting to process directory: {directory_path}")

    # Step 1: every file below the directory, whatever its extension.
    discovered_paths = get_all_files_from_directory(directory_path)
    if len(discovered_paths) == 0:
        return "No files found in the directory."
    print(f"Found {len(discovered_paths)} files to process.")

    # Step 2: read whatever can be read.
    loaded_docs = load_documents(discovered_paths)
    if len(loaded_docs) == 0:
        return "Could not load any readable text content from the files found."
    print(f"Successfully loaded content from {len(loaded_docs)} readable files.")

    # Step 3: chunking strategy (no LLM summarization involved).
    docs_for_index = chunk_and_embed_documents(loaded_docs)
    if len(docs_for_index) == 0:
        return "Failed to process documents with chunking strategy. Aborting."

    # Step 4: in-memory Qdrant collection.
    # NOTE(review): from_documents is given embeddings_model and presumably
    # embeds each document's page_content itself; the `avg_embedding` entry in
    # the metadata would then only travel along as payload — confirm intended.
    store_options = {
        "location": ":memory:",  # in-memory database, nothing persisted
        "collection_name": "directory_documents",
    }
    print(f"Creating in-memory Qdrant collection: '{store_options['collection_name']}'")
    try:
        vector_store = Qdrant.from_documents(docs_for_index, embeddings_model, **store_options)
        print("Successfully created Qdrant vector store in memory.")
        return vector_store
    except Exception as e:
        return f"An error occurred while creating the Qdrant vector store: {e}"

In [None]:
# Module-level handle to the most recently built vector store; written by
# index_directory and read by search_documents.
global_vector_store = None

@tool
def index_directory(directory_path: str) -> str:
    """
    Processes all readable files in a directory using chunking strategy for large files
    and averaged embeddings. No LLM calls during indexing - only HuggingFace embeddings.
    Args:
        directory_path: The absolute path to the directory to be indexed.
    Returns:
        A success or error message.
    """
    global global_vector_store
    outcome = _index_logic(directory_path)

    # _index_logic reports any failure as a plain message string.
    if not isinstance(outcome, str):
        # Anything else is the vector store: publish it for the search tool.
        global_vector_store = outcome
        return f"Successfully processed directory '{directory_path}'. Documents are now indexed with chunked embeddings."

    return outcome

@tool
def search_documents(query: str, k: int = 5) -> str:
    """
    Search through the indexed documents using semantic similarity.
    Args:
        query: The search query
        k: Number of results to return (default 5)
    Returns:
        Search results with file paths and relevant content
    """
    # Reading the module-level store needs no `global` declaration.
    if global_vector_store is None:
        return "No documents have been indexed yet. Please run index_directory first."

    preview_chars = 500  # maximum characters of content shown per result

    try:
        search_results = global_vector_store.similarity_search(query, k=k)

        if not search_results:
            return f"No relevant documents found for query: '{query}'"

        parts = [f"Found {len(search_results)} relevant documents for query: '{query}'\n\n"]

        for i, doc in enumerate(search_results, 1):
            source = doc.metadata.get('source', 'Unknown')
            content = doc.page_content
            # Only mark the preview as truncated when something was cut off.
            content_preview = content[:preview_chars]
            if len(content) > preview_chars:
                content_preview += "..."
            num_chunks = doc.metadata.get('num_chunks', 'N/A')

            parts.append(f"Result {i}:\n")
            parts.append(f"File: {source}\n")
            if num_chunks != 'N/A':
                # Present only for documents that went through chunking.
                parts.append(f"Chunks: {num_chunks}\n")
            parts.append(f"Content: {content_preview}\n")
            parts.append("-" * 80 + "\n\n")

        return "".join(parts)

    except Exception as e:
        # Reported as text so the calling graph node can surface it.
        return f"Error searching documents: {e}"

In [None]:
# State shared by all nodes of the agent graph
class AgentState(TypedDict):
    """State dictionary passed between the nodes of the LangGraph workflow."""
    # Conversation messages; annotated with the add_messages reducer.
    messages: Annotated[list, add_messages]
    # Content of the last message, as copied into the state by the node functions.
    query: str
    # Text returned by the document search; "" when no search was performed.
    search_results: str
    # True on the vector-store path, False on the routing / direct-response path.
    needs_local_knowledge: bool

# Routing decision delegated to the Ollama LLM
def decide_knowledge_source(state: AgentState) -> str:
    """
    Ask the LLM whether the latest message needs the local vector store.

    Returns "use_vector_store" when the model's reply contains "LOCAL"
    (case-insensitive) or when the LLM call raises; otherwise "direct_response".
    """
    user_question = state["messages"][-1].content

    routing_prompt = f"""You are an intelligent assistant that determines whether a user's question requires local knowledge from a vector database containing files and documents from a directory, or if it can be answered with general knowledge.

User Question: "{user_question}"

Consider the following:
- Questions about specific files, code, implementations, or content within a directory require LOCAL KNOWLEDGE
- Questions about general concepts, explanations, tutorials, or broad topics can be answered with GENERAL KNOWLEDGE
- Questions asking "what is in this directory", "show me files", "explain this code" require LOCAL KNOWLEDGE
- Questions like "what is Python?", "how does machine learning work?", "explain REST APIs" need GENERAL KNOWLEDGE

Respond with EXACTLY one word: either "LOCAL" or "GENERAL"

Decision:"""

    try:
        verdict = llm.complete(routing_prompt).text.strip().upper()
    except Exception as e:
        # If the router itself fails, fall back to searching the local index.
        print(f"Error in decision making: {e}")
        return "use_vector_store"

    return "use_vector_store" if "LOCAL" in verdict else "direct_response"

def search_step(state: AgentState) -> AgentState:
    """
    Graph node: run the document search for the latest message.

    Args:
        state: Current agent state; the content of the last message is the query.

    Returns:
        State with `query` and `search_results` filled in and
        `needs_local_knowledge` set to True. `search_results` is the text
        returned by the search_documents tool (hits or a status message).
    """
    query = state["messages"][-1].content
    # search_documents is a LangChain tool object: run it through .invoke()
    # with its named arguments rather than calling the object directly,
    # which is deprecated.
    search_results = search_documents.invoke({"query": query})

    return {
        "messages": state["messages"],
        "query": query,
        "search_results": search_results,
        "needs_local_knowledge": True
    }

def vector_response_step(state: AgentState) -> AgentState:
    """
    Graph node: answer the latest message from the vector-store search results.

    The text in `search_results` is classified by how it starts, so document
    previews that merely contain one of the status phrases are not mistaken
    for a status message:

    - empty, or the "No relevant documents found" message: fixed "nothing found" reply;
    - "No documents have been indexed yet" / "Error searching documents:":
      the status message is returned to the user as-is instead of being
      presented to the LLM as if it were document content;
    - anything else: the LLM is asked to answer from the results.

    Returns:
        State with the reply appended to `messages` as an AIMessage and
        `needs_local_knowledge` set to True.
    """
    query = state["messages"][-1].content
    search_results = state.get("search_results", "")

    # Prefixes of the non-result messages that search_documents can return.
    no_hits_prefix = "No relevant documents found"
    status_prefixes = (
        "No documents have been indexed yet",
        "Error searching documents:",
    )

    if not search_results or search_results.startswith(no_hits_prefix):
        response_content = "No relevant documents found in the local directory for your question."
    elif search_results.startswith(status_prefixes):
        # Not search hits: surface the tool's own message verbatim.
        response_content = search_results
    else:
        # Use Ollama to generate a response based on the search results
        response_prompt = f"""Based on the following search results from local files and documents, answer the user's question comprehensively.

User Question: {query}

Search Results:
{search_results}

Please provide a helpful answer based on the information found in the local documents. If the search results don't fully answer the question, say so and provide what information is available.

Answer:"""

        try:
            ollama_response = llm.complete(response_prompt)
            response_content = f"Based on the local documents:\n\n{ollama_response.text}"
        except Exception as e:
            # Fall back to the raw hits so the user still gets something useful.
            response_content = f"Found local documents but error generating response: {e}\n\nRaw search results:\n{search_results}"

    from langchain_core.messages import AIMessage
    return {
        "messages": state["messages"] + [AIMessage(content=response_content)],
        "query": query,
        "search_results": search_results,
        "needs_local_knowledge": True
    }

def direct_response_step(state: AgentState) -> AgentState:
    """
    Graph node: answer the latest message with the LLM alone (no vector store).

    The reply — or an error text if the LLM call raises — is appended to
    `messages` as an AIMessage; `search_results` is "" and
    `needs_local_knowledge` is False in the returned state.
    """
    question = state["messages"][-1].content

    try:
        # General-knowledge path: the raw question is the whole prompt.
        answer_text = llm.complete(question).text
    except Exception as e:
        answer_text = f"Error generating response: {e}"

    from langchain_core.messages import AIMessage
    reply = AIMessage(content=answer_text)
    return dict(
        messages=state["messages"] + [reply],
        query=question,
        search_results="",
        needs_local_knowledge=False,
    )

def route_query(state: AgentState) -> AgentState:
    """
    Entry node: copy the last message's content into `query` and reset
    `search_results` / `needs_local_knowledge` to their empty defaults.
    """
    history = state["messages"]
    latest_text = history[-1].content
    return dict(
        messages=history,
        query=latest_text,
        search_results="",
        needs_local_knowledge=False,
    )

# Assemble the LangGraph workflow with LLM-based routing
def create_rag_agent():
    """
    Build and compile the agent graph.

    Topology: `route` is the entry point; decide_knowledge_source then sends
    the flow either to `search` -> `vector_response` or to `direct_response`,
    and both response nodes lead to END.
    """
    graph = StateGraph(AgentState)

    # Nodes, registered in the same order as they are described above.
    node_table = (
        ("route", route_query),
        ("search", search_step),
        ("vector_response", vector_response_step),
        ("direct_response", direct_response_step),
    )
    for node_name, node_fn in node_table:
        graph.add_node(node_name, node_fn)

    graph.set_entry_point("route")

    # The router's return value is mapped onto the next node's name.
    branch_targets = {
        "use_vector_store": "search",
        "direct_response": "direct_response",
    }
    graph.add_conditional_edges("route", decide_knowledge_source, branch_targets)

    # A search is always followed by the search-based answer.
    graph.add_edge("search", "vector_response")

    # Either kind of answer finishes the run.
    for terminal_node in ("vector_response", "direct_response"):
        graph.add_edge(terminal_node, END)

    return graph.compile()

# Instantiate the compiled agent used by the cells below
rag_agent = create_rag_agent()

print("RAG Agent with LLM-based decision making created successfully!")

In [None]:
print("=== RAG Agent Workflow Diagram ===")

try:
    # Render the compiled graph as a PNG and show it inline.
    from IPython.display import Image, display

    diagram_png = rag_agent.get_graph().draw_mermaid_png()
    display(Image(diagram_png))
    print("Workflow diagram displayed above.")

except Exception as e:
    print(f"Could not generate diagram: {e}")

    # Rendering failed: describe the workflow in text instead.
    print("\n".join((
        "\nWorkflow Structure (Text):",
        "1. route (entry point)",
        "2. decide_knowledge_source() -> LOCAL or GENERAL",
        "3a. If LOCAL: search -> vector_response -> END",
        "3b. If GENERAL: direct_response -> END",
        "\nNodes:",
        "- route: Initial query processing",
        "- search: Vector database search",
        "- vector_response: LLM response with search results",
        "- direct_response: Direct LLM response",
    )))

In [None]:
# First, index a directory.
# index_directory is a LangChain tool object: run it through .invoke() with
# its named argument instead of calling the object directly (deprecated).
print("Indexing directory...")
result = index_directory.invoke({"directory_path": "../."})
print(result)

In [None]:
# Collect the payload of every point in the collection, page by page.
all_payloads = []
offset = None  # None asks for the first page
while True:
    # Pass the offset returned by the previous call; without it every call
    # would return the first page again and the loop would never terminate
    # once the collection holds more than one page of points.
    points, offset = global_vector_store.client.scroll(
        collection_name=global_vector_store.collection_name,
        with_payload=True,
        offset=offset,
    )
    all_payloads.extend(p.payload for p in points)
    if offset is None:  # no further page
        break

all_payloads

In [None]:
# Test 1: Local knowledge question (should use vector store)
print("\n--- Test 1: Local Knowledge Question ---")
local_query = "What file should I look at when working with SmolAgents?"
local_state = {
    "messages": [HumanMessage(content=local_query)],
    "query": "",
    "search_results": "",
    "needs_local_knowledge": False
}

result = rag_agent.invoke(local_state)
print(f"Query: {local_query}")
# Show the reply text itself rather than the message object's repr.
print(f"Response: {result['messages'][-1].content}")
print(f"Used local knowledge: {result['needs_local_knowledge']}")

print("\n" + "="*50)

In [None]:
# Test 2: General knowledge question (should use direct Ollama)
print("\n--- Test 2: General Knowledge Question ---")
general_query = "Explain history of Prussia briefly."
general_state = {
    "messages": [HumanMessage(content=general_query)],
    "query": "",
    "search_results": "",
    "needs_local_knowledge": False
}

result = rag_agent.invoke(general_state)
print(f"Query: {general_query}")
# Show the reply text itself rather than the message object's repr.
print(f"Response: {result['messages'][-1].content}")
print(f"Used local knowledge: {result['needs_local_knowledge']}")

print("\n" + "="*50)

In [None]:

# Test 3: Ambiguous question to test decision making
print("\n--- Test 3: Ambiguous Question ---")
ambiguous_query = "How do I implement a function?"
ambiguous_state = {
    "messages": [HumanMessage(content=ambiguous_query)],
    "query": "",
    "search_results": "",
    "needs_local_knowledge": False
}

result = rag_agent.invoke(ambiguous_state)
print(f"Query: {ambiguous_query}")
# Show the reply text itself rather than the message object's repr.
print(f"Response: {result['messages'][-1].content}")
print(f"Used local knowledge: {result['needs_local_knowledge']}")

print("\n=== Enhanced RAG System Test Complete ===")