In [1]:
import os
from typing import List, Any, Union
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
)
from langchain.docstore.document import Document
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.tools import tool
from qdrant_client import QdrantClient
from llama_index.llms.ollama import Ollama

In [2]:
# Loader classes keyed by file extension (with the leading dot).
SPECIALIZED_LOADERS = {".pdf": PyPDFLoader}

# Settings handed to the Ollama client when the LLM is constructed.
MODEL_NAME = "qwen2:7b"
CONTEXT_WINDOW = 80000
REQUEST_TIMEOUT = 300

# Folder / file names that the directory scan leaves out.
FOLDERS_TO_EXLUDE = [
    ".claude/",
    ".conda",
    ".gradio/",
    "__pycache__",
    ".git",
    ".DS_Store",
]


In [3]:
# Build the Ollama-backed LLM client from the configuration constants.
llm = Ollama(model=MODEL_NAME, context_window=CONTEXT_WINDOW, request_timeout=REQUEST_TIMEOUT)

# Smoke test: send a trivial prompt and show the first 50 characters of the reply
# so a broken model/server setup is visible before any indexing starts.
test_response = llm.complete("Hello")
print(f"Model initialized: {test_response.text[:50]}...")

Model initialized: Hello! How can I assist you today? Let me know if ...


In [4]:
def get_all_files_from_directory(
    directory_path: str,
    exclude: Union[List[str], None] = None,
) -> List[str]:
    """
    Recursively collects the paths of all files under a directory.

    A file is left out when its own name, or the name of any directory between
    ``directory_path`` and the file, equals one of the excluded names. Names
    are compared as whole path components, so ".git" excludes the ".git"
    folder but not ".github/" or ".gitignore", and the location of
    ``directory_path`` itself never causes files to be dropped. Excluded
    directories are pruned from the walk and are not descended into.

    Args:
        directory_path: Root directory to scan.
        exclude: Names to skip. Leading/trailing path separators are ignored,
            so ".claude/" is treated as ".claude". When omitted, the
            module-level FOLDERS_TO_EXLUDE list is used.

    Returns:
        The file paths in os.walk order, each built by joining the walked
        directory with the file name.
    """
    if exclude is None:
        exclude = FOLDERS_TO_EXLUDE

    # Normalise "name/" and "name\\" style entries to bare component names.
    excluded_names = {name.strip("/\\") for name in exclude}
    excluded_names.discard("")

    all_files = []
    for root, dirs, files in os.walk(directory_path):
        # Assign in place: os.walk only skips sub-directories that are removed
        # from this very list object.
        dirs[:] = [d for d in dirs if d not in excluded_names]
        all_files.extend(
            os.path.join(root, file) for file in files if file not in excluded_names
        )
    return all_files

def load_documents(file_paths: List[str]) -> List[Document]:
    """
    Loads documents from a list of file paths using the appropriate loader.

    The loader is chosen by file extension, compared case-insensitively
    against SPECIALIZED_LOADERS; every other file is read with TextLoader as
    UTF-8 text. Whatever list a loader's ``load()`` returns is appended as-is,
    so one file may contribute more than one Document.

    Args:
        file_paths: Paths of the files to load.

    Returns:
        All Documents that could be loaded. Files whose loader raises are
        reported on stdout and skipped rather than aborting the whole run.
    """
    documents = []
    for file_path in file_paths:
        # Lower-case the extension so "report.PDF" is routed to the PDF loader
        # instead of being decoded as plain text.
        extension = os.path.splitext(file_path)[1].lower()
        loader_class = SPECIALIZED_LOADERS.get(extension)

        try:
            if loader_class:
                loader = loader_class(file_path)
            else:
                # Default to TextLoader for all other files
                loader = TextLoader(file_path, encoding='utf-8')
            documents.extend(loader.load())
        except Exception as e:
            # Deliberately broad: best-effort loading. This is where binary
            # files that cannot be decoded as text end up.
            print(f"Skipping file {file_path}, could not be read as text. Error: {e}")

    return documents

def generate_summaries(documents: List[Document]) -> List[Document]:
    """
    Asks the module-level ``llm`` for a summary of every non-empty document.

    Each returned Document holds the LLM's summary text as its content and a
    metadata dict with a single "source" entry copied from the input document
    ("N/A" when the input has none). Documents whose content is blank are
    skipped, and a document whose summarisation raises is reported on stdout
    and left out of the result.
    """
    print(f"Generating summaries for {len(documents)} documents...")

    summaries = []
    for doc in documents:
        source = doc.metadata.get("source", "N/A")

        if not doc.page_content.strip():
            print(f"Skipping empty document: {source}")
            continue

        try:
            # Build the full prompt for this document. The text (including its
            # indentation) is sent to the model exactly as written here.
            prompt = f"""Write a summary of the following text.
            Capture the key topics, arguments, important entities, and the main purpose of the document.
            The summary should be comprehensive enough to replace the original text for the purpose of a semantic search.
            Aim for 200 words summary. 

            TEXT:
            "{doc.page_content}"

            EXTENSIVE SUMMARY:"""
            completion = llm.complete(prompt)
            # Wrap the summary in a fresh Document that points back at the original file.
            summaries.append(
                Document(page_content=completion.text, metadata={"source": source})
            )
        except Exception as e:
            print(f"Error summarizing document {source}: {e}")

    print("Finished generating summaries.")
    return summaries

In [5]:
def _index_logic(directory_path: str) -> Union[Qdrant, str]:
    """
    Runs the whole indexing pipeline for one directory.

    Steps: validate the path, list its files, load them, summarise them with
    the LLM, embed the summaries and store them in an in-memory Qdrant
    collection named "directory_summaries".

    Returns:
        The Qdrant vector store on success. If any stage cannot proceed, a
        message string describing the problem is returned instead, so callers
        must check the type of the result.
    """
    # Guard: nothing to do for a path that is not a directory.
    if not os.path.isdir(directory_path):
        return f"Error: The provided path '{directory_path}' is not a valid directory."
    print(f"Starting to process directory: {directory_path}")

    # Stage 1: discover files (all extensions).
    discovered = get_all_files_from_directory(directory_path)
    if not discovered:
        return "No files found in the directory."
    print(f"Found {len(discovered)} files to process.")

    # Stage 2: read whatever can be read.
    loaded = load_documents(discovered)
    if not loaded:
        return "Could not load any readable text content from the files found."
    print(f"Successfully loaded content from {len(loaded)} readable files.")

    # Stage 3: one LLM summary per loaded document.
    summaries = generate_summaries(loaded)
    if not summaries:
        return "Failed to generate summaries for the documents. Aborting."

    # Stage 4: embedding model used to vectorise the summaries.
    print("Initializing Hugging Face embeddings...")
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Stage 5: build the vector store; ":memory:" keeps it in-process.
    collection_name = "directory_summaries"
    print(f"Creating in-memory Qdrant collection: '{collection_name}'")
    try:
        store = Qdrant.from_documents(
            summaries,
            embedder,
            location=":memory:",
            collection_name=collection_name,
        )
        print("Successfully created Qdrant vector store in memory.")
        return store
    except Exception as e:
        return f"An error occurred while creating the Qdrant vector store: {e}"


In [6]:
@tool
def index_directory_with_summaries(directory_path: str) -> str:
    """
    Processes all readable files in a directory (code, text, pdf, etc.), creates
    extensive summaries of their content, and indexes these summaries in an
    in-memory Qdrant vector store using Hugging Face embeddings. The original
    file path is stored as metadata.
    Args:
        directory_path: The absolute path to the directory to be indexed.
    Returns:
        A success or error message.
    """
    # NOTE: the docstring above is kept verbatim on purpose; the @tool decorator
    # exposes it as the tool's description.
    outcome = _index_logic(directory_path)

    # Anything that is not a string is the vector store itself. A tool has to
    # answer with text, so report success instead of returning the object.
    if not isinstance(outcome, str):
        return f"Successfully processed directory '{directory_path}'. Summaries are now indexed in an in-memory Qdrant collection."

    # A string outcome is the message produced by _index_logic; pass it through.
    return outcome

In [None]:
# Index the parent directory. The result is either a Qdrant store or a
# message string explaining why indexing did not complete.
vector_store = _index_logic("../.")

# Collect the payload of every stored point by paging through the collection.
all_payloads = []
if isinstance(vector_store, Qdrant):
    offset = None  # None asks scroll for the first page
    while True:
        points, offset = vector_store.client.scroll(
            collection_name=vector_store.collection_name,
            # Resume from where the previous page ended. Without this every
            # call returns the first page again and the loop never finishes
            # once the collection holds more than one page of points.
            offset=offset,
            with_payload=True,
        )
        all_payloads.extend(p.payload for p in points)
        if offset is None:  # no further pages
            break
else:
    # Indexing returned a message instead of a store; there is nothing to scroll.
    print(f"Indexing did not produce a vector store: {vector_store}")
all_payloads

--- Running Indexing Logic ---
Starting to process directory: ../.
Found 216 files to process.
Successfully loaded content from 216 readable files.
Generating summaries for 216 documents...


KeyboardInterrupt: 

In [None]:
# --- Test the in-memory index ---
if not isinstance(vector_store, Qdrant):
    # Indexing returned a message string rather than a store: show it.
    print(f"\n--- Tool Execution Failed ---")
    print(vector_store)
else:
    print("\n--- Testing the In-Memory Index ---")
    query = "What is the python script about?"
    search_results = vector_store.similarity_search(query)

    print(f"\nQuery: '{query}'")
    print("Search Results (from summaries):")
    # Each hit is a summary Document; its "source" metadata names the original file.
    for doc in search_results:
        print(f"\nSummary: {doc.page_content}")
        print(f"Original File: {doc.metadata['source']}")

True