In [1]:
import os
from typing import List, Any, Union
from typing import Annotated

import numpy as np
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import tool
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_core.messages import AIMessage, HumanMessage
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from llama_index.llms.ollama import Ollama
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from typing_extensions import TypedDict

In [4]:
# File extensions that are routed to a dedicated loader class.
SPECIALIZED_LOADERS = {".pdf": PyPDFLoader}

# --- Chunking / embedding settings ---
CHUNK_SIZE = 1000     # maximum characters per chunk
CHUNK_OVERLAP = 200   # characters shared between consecutive chunks
# NOTE(review): not referenced anywhere in the visible notebook; also confirm the
# value against the embedding model's real token limit before relying on it.
MAX_INPUT_SIZE = 512

# --- LLM settings ---
MODEL_NAME = "qwen2:7b"
CONTEXT_WINDOW = 80000
REQUEST_TIMEOUT = 300

# Path fragments that cause a file to be ignored while scanning a directory.
FOLDERS_TO_EXCLUDE = [
    ".claude/",
    ".conda",
    ".gradio/",
    "__pycache__",
    ".git",
    ".DS_Store",
]

In [5]:
# Build the Ollama LLM client from the configuration constants.
llm = Ollama(model=MODEL_NAME, context_window=CONTEXT_WINDOW, request_timeout=REQUEST_TIMEOUT)

# Smoke test: one short completion, showing only the first 50 characters of the reply.
test_response = llm.complete("Hello")
print(f"Model initialized: {test_response.text[:50]}...")

Model initialized: Hello! How can I assist you today?...


In [None]:
# Sentence-transformers embedding model, run on CPU with unit-normalized output vectors.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Character-based splitter: tries paragraph breaks first, then line breaks,
# then spaces, and finally splits between arbitrary characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

print("Embeddings model and text splitter initialized successfully.")

In [None]:
def get_all_files_from_directory(
    directory_path: str,
    exclude: Union[List[str], None] = None,
) -> List[str]:
    """
    Recursively collects the paths of all files under a directory.

    A path is dropped when any exclusion pattern occurs in it as a plain
    substring (so ".git" also matches ".github" and ".gitignore").
    Directories whose path already matches a pattern are pruned from the walk,
    so large trees such as ".git" or ".conda" are never traversed.

    Args:
        directory_path: Root directory to scan.
        exclude: Substring patterns to skip. Defaults to FOLDERS_TO_EXCLUDE
            when omitted.
    Returns:
        File paths (root joined with file name) in os.walk order.
    """
    patterns = FOLDERS_TO_EXCLUDE if exclude is None else exclude

    def _is_excluded(path: str) -> bool:
        return any(pattern in path for pattern in patterns)

    all_files = []
    for root, dirs, files in os.walk(directory_path):
        # Prune in place so os.walk does not descend into excluded directories.
        # The trailing separator makes patterns like ".claude/" match the
        # directory itself; every file below it would be excluded anyway.
        dirs[:] = [
            name for name in dirs
            if not _is_excluded(os.path.join(root, name) + os.sep)
        ]
        for file in files:
            file_path = os.path.join(root, file)
            if not _is_excluded(file_path):
                all_files.append(file_path)
    return all_files

def load_documents(file_paths: List[str]) -> List[Document]:
    """
    Loads documents from a list of file paths using the appropriate loader.

    The file extension is matched case-insensitively against
    SPECIALIZED_LOADERS (so "report.PDF" is handled like "report.pdf");
    every other file is read with TextLoader as UTF-8.

    Args:
        file_paths: Paths of the files to load.
    Returns:
        All documents produced by the loaders. Files whose loader raises are
        reported on stdout and skipped.
    """
    documents = []
    for file_path in file_paths:
        _, extension = os.path.splitext(file_path)
        # Normalize case: the loader table is keyed by lower-case extensions.
        loader_class = SPECIALIZED_LOADERS.get(extension.lower())

        try:
            if loader_class:
                loader = loader_class(file_path)
            else:
                # Default to TextLoader for all other files
                loader = TextLoader(file_path, encoding='utf-8')
            documents.extend(loader.load())
        except Exception as e:
            # Best-effort by design: e.g. binary files that cannot be decoded
            # are reported and skipped rather than aborting the whole run.
            print(f"Skipping file {file_path}, could not be read as text. Error: {e}")
            continue

    return documents

def chunk_and_embed_documents(documents: List[Document]) -> List[Document]:
    """
    Prepares documents for indexing, one output document per usable input.

    - Whitespace-only documents are skipped.
    - Documents of at most CHUNK_SIZE characters are passed through unchanged.
    - Longer documents are split with text_splitter, each non-blank chunk is
      embedded, and the mean of the chunk vectors is stored under
      metadata["avg_embedding"] together with metadata["num_chunks"]; the
      full original text is kept as page_content.

    NOTE(review): the averaged vector is only written to metadata here;
    whether the vector store actually uses it depends on the indexing code
    that consumes these documents.

    Errors are printed and the affected chunk/document is skipped.
    """
    def _source(document: Document) -> str:
        return document.metadata.get('source', 'N/A')

    print(f"Processing {len(documents)} documents with chunking strategy...")
    processed_docs = []

    for doc in documents:
        text = doc.page_content
        if not text.strip():
            print(f"Skipping empty document: {_source(doc)}")
            continue

        try:
            # Short documents need no chunking.
            if len(text) <= CHUNK_SIZE:
                processed_docs.append(doc)
                print(f"Document {_source(doc)} is small enough, using directly")
                continue

            pieces = text_splitter.split_text(text)
            print(f"Document {_source(doc)} chunked into {len(pieces)} pieces")

            # Embed every non-blank chunk; a failing chunk is reported and skipped.
            vectors = []
            for index, piece in enumerate(pieces):
                if not piece.strip():
                    continue
                try:
                    vectors.append(np.array(embeddings_model.embed_query(piece)))
                except Exception as e:
                    print(f"Error embedding chunk {index} of {_source(doc)}: {e}")

            if not vectors:
                print(f"Failed to create embeddings for {_source(doc)}")
                continue

            mean_vector = np.mean(vectors, axis=0)

            # Copy the metadata and attach the averaged vector (as a plain
            # list) plus the total number of chunks the splitter produced.
            enriched_metadata = dict(doc.metadata)
            enriched_metadata["avg_embedding"] = mean_vector.tolist()
            enriched_metadata["num_chunks"] = len(pieces)
            processed_docs.append(Document(page_content=text, metadata=enriched_metadata))

        except Exception as e:
            print(f"Error processing document {_source(doc)}: {e}")

    print(f"Finished processing documents. {len(processed_docs)} documents ready for indexing.")
    return processed_docs

In [None]:
def _index_logic(directory_path: str) -> Union[Qdrant, str]:
    """
    Core indexing pipeline: scan a directory, load and preprocess its files,
    and build an in-memory Qdrant vector store.

    Each document is indexed with exactly one vector: the averaged chunk
    embedding stored under metadata["avg_embedding"] when present, otherwise
    a fresh embedding of its page_content. The collection is therefore built
    from those vectors directly instead of re-embedding every document, and
    the (large) "avg_embedding" list is left out of the stored payload.

    Args:
        directory_path: Directory to index.
    Returns:
        The populated Qdrant vector store on success, or a human-readable
        error/status string on failure.
    """
    if not os.path.isdir(directory_path):
        return f"Error: The provided path '{directory_path}' is not a valid directory."

    print(f"Starting to process directory: {directory_path}")

    # 1. Get all files from the directory, regardless of extension
    file_paths = get_all_files_from_directory(directory_path)
    if not file_paths:
        return "No files found in the directory."
    print(f"Found {len(file_paths)} files to process.")

    # 2. Load the content of all readable documents
    documents = load_documents(file_paths)
    if not documents:
        return "Could not load any readable text content from the files found."
    print(f"Successfully loaded content from {len(documents)} readable files.")

    # 3. Process documents with chunking strategy (no LLM summarization)
    processed_docs = chunk_and_embed_documents(documents)
    if not processed_docs:
        return "Failed to process documents with chunking strategy. Aborting."

    # 4. Create an in-memory Qdrant vector store
    collection_name = "directory_documents"
    print(f"Creating in-memory Qdrant collection: '{collection_name}'")
    try:
        # Reuse averaged embeddings where available; embed the rest in one batch.
        vectors = [doc.metadata.get("avg_embedding") for doc in processed_docs]
        missing = [i for i, vector in enumerate(vectors) if vector is None]
        if missing:
            fresh = embeddings_model.embed_documents(
                [processed_docs[i].page_content for i in missing]
            )
            for i, vector in zip(missing, fresh):
                vectors[i] = vector

        client = QdrantClient(location=":memory:")  # in-memory database
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=len(vectors[0]), distance=Distance.COSINE),
        )
        client.upsert(
            collection_name=collection_name,
            points=[
                PointStruct(
                    id=point_id,
                    vector=[float(x) for x in vector],
                    # Payload keys match the LangChain Qdrant wrapper defaults.
                    payload={
                        "page_content": doc.page_content,
                        "metadata": {
                            key: value
                            for key, value in doc.metadata.items()
                            if key != "avg_embedding"
                        },
                    },
                )
                for point_id, (doc, vector) in enumerate(zip(processed_docs, vectors))
            ],
        )

        # The wrapper still needs the embedding model to embed search queries.
        qdrant = Qdrant(
            client=client,
            collection_name=collection_name,
            embeddings=embeddings_model,
        )
        print("Successfully created Qdrant vector store in memory.")
        return qdrant
    except Exception as e:
        return f"An error occurred while creating the Qdrant vector store: {e}"

In [None]:
# Module-level handle to the most recently built vector store; written by
# index_directory and read by search_documents.
global_vector_store = None

@tool
def index_directory(directory_path: str) -> str:
    """
    Processes all readable files in a directory using chunking strategy for large files
    and averaged embeddings. No LLM calls during indexing - only HuggingFace embeddings.
    Args:
        directory_path: The absolute path to the directory to be indexed.
    Returns:
        A success or error message.
    """
    global global_vector_store

    outcome = _index_logic(directory_path)

    # A string outcome is an error/status message: pass it through untouched
    # and leave any previously stored vector store in place.
    if isinstance(outcome, str):
        return outcome

    # Otherwise the outcome is the new vector store; publish it for searching.
    global_vector_store = outcome
    return f"Successfully processed directory '{directory_path}'. Documents are now indexed with chunked embeddings."

@tool
def search_documents(query: str, k: int = 5) -> str:
    """
    Search through the indexed documents using semantic similarity.
    Args:
        query: The search query
        k: Number of results to return (default 5)
    Returns:
        Search results with file paths and relevant content
    """
    # Nothing to search until index_directory has stored a vector store.
    if global_vector_store is None:
        return "No documents have been indexed yet. Please run index_directory first."

    try:
        hits = global_vector_store.similarity_search(query, k=k)
        if not hits:
            return f"No relevant documents found for query: '{query}'"

        # Collect the report piece by piece and join once at the end.
        parts = [f"Found {len(hits)} relevant documents for query: '{query}'\n\n"]

        for rank, hit in enumerate(hits, 1):
            text = hit.page_content
            # Show at most the first 500 characters of each document.
            if len(text) > 500:
                preview = text[:500] + "..."
            else:
                preview = text

            parts.append(f"Result {rank}:\n")
            parts.append(f"File: {hit.metadata.get('source', 'Unknown')}\n")

            # The chunk count is only present for documents that were chunked.
            chunk_count = hit.metadata.get('num_chunks', 'N/A')
            if chunk_count != 'N/A':
                parts.append(f"Chunks: {chunk_count}\n")

            parts.append(f"Content: {preview}\n")
            parts.append("-" * 80 + "\n\n")

        return "".join(parts)

    except Exception as e:
        return f"Error searching documents: {e}"

In [None]:
# Define the state for our agent
class AgentState(TypedDict):
    """State dictionary passed between the nodes of the LangGraph workflow."""
    # Conversation history; the add_messages annotation is the reducer
    # LangGraph uses to merge message updates returned by nodes.
    messages: Annotated[list, add_messages]
    # Content of the last message, as recorded by the search/respond nodes.
    query: str
    # Text produced by the document search; empty string when no search ran.
    search_results: str

# Agent functions
def should_search(state: AgentState) -> str:
    """
    Keyword-based router: returns "search" when the lower-cased content of the
    last message contains any trigger word (substring match), else "respond".
    """
    text = state["messages"][-1].content.lower()
    triggers = ("what", "how", "where", "find", "show", "explain", "code", "file", "script", "function")
    for trigger in triggers:
        if trigger in text:
            return "search"
    return "respond"

def search_step(state: AgentState) -> AgentState:
    """
    Graph node: run a document search for the content of the last message.

    search_documents is a LangChain tool object, so it is executed through
    `.invoke` with a dict of arguments; calling the tool object directly is
    deprecated and does not accept keyword arguments.

    Returns:
        A partial state update with the query and the search result text.
        The message history is not modified, so it is not returned.
    """
    query = state["messages"][-1].content
    search_results = search_documents.invoke({"query": query})

    return {
        "query": query,
        "search_results": search_results,
    }

def respond_step(state: AgentState) -> AgentState:
    """
    Graph node: build the final AI reply, with or without search results.

    When the state carries non-empty search results they are embedded in the
    reply; otherwise a fixed hint asking the user to index a directory is
    returned.

    Returns:
        A state update containing only the new AI message (the add_messages
        reducer on AgentState.messages appends it to the history), the query,
        and the search results ("" when absent).
    """
    query = state["messages"][-1].content
    search_results = state.get("search_results", "")

    if search_results:
        # We have search results, use them in the response
        response_content = f"Based on the documents in the indexed directory:\n\n{search_results}"
    else:
        # No search results, general response
        response_content = "I can help you search through indexed documents. Please index a directory first using the index_directory function, then ask questions about the contents."

    return {
        # Only the new message: the reducer merges it into the existing history.
        "messages": [AIMessage(content=response_content)],
        "query": query,
        "search_results": search_results,
    }

# Create the LangGraph
def create_rag_agent():
    """
    Create a LangGraph agent with RAG capabilities.

    The workflow is linear: search_step -> respond_step -> END.

    The edge from search_step is a plain edge. A conditional edge whose router
    returns a value that is not a key of its path map fails when the graph is
    invoked, and no routing decision is needed here because every query is
    searched before responding.

    Returns:
        The compiled graph, ready for `.invoke(state)`.
    """
    workflow = StateGraph(AgentState)

    workflow.add_node("search_step", search_step)
    workflow.add_node("respond_step", respond_step)

    # Every run starts with a document search...
    workflow.set_entry_point("search_step")

    # ...whose results are always handed to the responder...
    workflow.add_edge("search_step", "respond_step")

    # ...and the responder ends the run.
    workflow.add_edge("respond_step", END)

    return workflow.compile()

# Build and compile the agent graph once; later cells call rag_agent.invoke(...)
rag_agent = create_rag_agent()


print("RAG Agent with LangGraph created successfully!")

In [None]:
# End-to-end smoke test of the RAG system.
# index_directory and search_documents are LangChain tool objects, so they are
# run through `.invoke` with a dict of arguments: calling a tool object
# directly is deprecated and rejects keyword arguments such as `k`.
print("=== Testing the New RAG System ===")

# First, index a directory
print("\n1. Indexing directory...")
index_message = index_directory.invoke({"directory_path": "../."})
print(index_message)

# Test search functionality
print("\n2. Testing search...")
search_result = search_documents.invoke({"query": "python script functionality", "k": 3})
print(search_result)

# Test the LangGraph agent
print("\n3. Testing LangGraph Agent...")

# Test query
test_query = "How do i install this directory as python project?"
initial_state = {
    "messages": [HumanMessage(content=test_query)],
    "query": "",
    "search_results": ""
}

# Run the agent; the last message of the final state is the agent's reply
final_state = rag_agent.invoke(initial_state)
print(f"Agent Response: {final_state['messages'][-1].content}")

print("\n=== RAG System Test Complete ===")