In [8]:
import os
from dotenv import load_dotenv
load_dotenv()
from os import environ

In [1]:
import re, os
import tiktoken

from bs4 import BeautifulSoup

from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_community.vectorstores import SKLearnVectorStore

def count_tokens(text, model="cl100k_base"):
    """
    Return how many tiktoken tokens ``text`` encodes to.

    Args:
        text (str): The text to tokenize.
        model (str): Name of the tiktoken encoding handed to
            ``tiktoken.get_encoding`` (default: "cl100k_base").

    Returns:
        int: Length of the encoded token sequence.
    """
    token_ids = tiktoken.get_encoding(model).encode(text)
    return len(token_ids)

def bs4_extractor(html: str) -> str:
    """
    Convert one HTML page into plain text.

    The text of the ``<article class="md-content__inner">`` element is used
    when the page contains one; otherwise the text of the whole parsed
    document is used. Runs of two or more newlines are collapsed to exactly
    two, and surrounding whitespace is stripped.

    Args:
        html (str): Raw HTML markup of the page.

    Returns:
        str: The extracted, whitespace-normalised text.
    """
    parsed = BeautifulSoup(html, "lxml")

    # Prefer the main article container; fall back to the full document.
    article = parsed.find("article", class_="md-content__inner")
    if article:
        raw_text = article.get_text()
    else:
        raw_text = parsed.text

    collapsed = re.sub(r"\n\n+", "\n\n", raw_text)
    return collapsed.strip()

def load_langgraph_docs():
    """
    Load LangGraph documentation pages and report their token counts.

    This function:
    1. Runs a RecursiveUrlLoader (max_depth=5, text extracted with
       ``bs4_extractor``) from each of a fixed list of documentation URLs
    2. Prints the number of loaded documents and the source URL of each
    3. Counts the tokens of every document with ``count_tokens`` and prints
       the total

    Returns:
        tuple: ``(docs, tokens_per_doc)`` where ``docs`` is the list of loaded
        Document objects and ``tokens_per_doc`` is a list of token counts in
        the same order as ``docs``.
    """
    print("Loading LangGraph documentation...")

    # Root pages the loader starts from
    urls = [
        "https://langchain-ai.github.io/langgraph/concepts/",
        "https://langchain-ai.github.io/langgraph/how-tos/",
        "https://langchain-ai.github.io/langgraph/tutorials/workflows/",
        "https://langchain-ai.github.io/langgraph/tutorials/introduction/",
        "https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/",
    ]

    docs = []
    for url in urls:
        loader = RecursiveUrlLoader(
            url,
            max_depth=5,
            extractor=bs4_extractor,
        )

        # Consume the lazy iterator straight into the result list
        docs.extend(loader.lazy_load())

    print(f"Loaded {len(docs)} documents from LangGraph documentation.")
    print("\nLoaded URLs:")
    for position, doc in enumerate(docs, start=1):
        print(f"{position}. {doc.metadata.get('source', 'Unknown URL')}")

    # Tokenise each document exactly once; the total is derived from the
    # per-document counts instead of encoding every page a second time.
    tokens_per_doc = [count_tokens(doc.page_content) for doc in docs]
    total_tokens = sum(tokens_per_doc)
    print(f"Total tokens in loaded documents: {total_tokens}")

    return docs, tokens_per_doc

def save_llms_full(documents, output_filename="llms_full.txt"):
    """
    Write all documents into a single text file.

    Each document is written as a ``DOCUMENT <n>`` header, a ``SOURCE:`` line
    taken from ``doc.metadata['source']`` (``'Unknown URL'`` when missing),
    a ``CONTENT:`` line, the page content, and a separator made of 80 ``=``
    characters surrounded by blank lines.

    Args:
        documents (list): Objects exposing ``metadata`` (a dict) and
            ``page_content`` (a str).
        output_filename (str): Path of the file to (over)write
            (default: "llms_full.txt" in the current working directory).
    """
    # The file is written as UTF-8 explicitly: scraped pages contain non-ASCII
    # characters, and the platform default encoding may be unable to encode
    # them (or may produce a file that differs from machine to machine).
    with open(output_filename, "w", encoding="utf-8") as f:
        for position, doc in enumerate(documents, start=1):
            # Get the source (URL) from metadata
            source = doc.metadata.get('source', 'Unknown URL')

            f.write(f"DOCUMENT {position}\n")
            f.write(f"SOURCE: {source}\n")
            f.write("CONTENT:\n")
            f.write(doc.page_content)
            f.write("\n\n" + "=" * 80 + "\n\n")

    print(f"Documents concatenated into {output_filename}")

def split_documents(documents):
    """
    Break documents into chunks suitable for embedding and retrieval.

    A RecursiveCharacterTextSplitter built with ``from_tiktoken_encoder`` is
    used with ``chunk_size=8000`` and ``chunk_overlap=500``. The number of
    resulting chunks and their combined token count (via ``count_tokens``)
    are printed.

    Args:
        documents (list): List of Document objects to split

    Returns:
        list: The chunked Document objects
    """
    print("Splitting documents...")

    # Large chunks keep a lot of context together; the overlap keeps some
    # continuity between neighbouring chunks.
    # NOTE(review): no encoding name is passed here, so the splitter measures
    # sizes with its library-default encoding, which may differ from the
    # cl100k_base encoding count_tokens uses -- confirm this is intended.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=8000,
        chunk_overlap=500,
    )
    chunks = splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks from documents.")

    # Overlapping text is counted once per chunk it appears in.
    chunk_token_total = sum(count_tokens(chunk.page_content) for chunk in chunks)
    print(f"Total tokens in split documents: {chunk_token_total}")

    return chunks

def create_vectorstore(splits):
    """
    Build and persist an SKLearnVectorStore from document chunks.

    This function:
    1. Initializes the OpenAI "text-embedding-3-large" embedding model
    2. Removes any vector store file left over from a previous run
    3. Embeds the chunks into a new SKLearnVectorStore
    4. Persists the store as parquet to
       ``<current working directory>/sklearn_vectorstore.parquet``

    Args:
        splits (list): List of split Document objects to embed

    Returns:
        SKLearnVectorStore: A vector store containing the embedded documents
    """
    print("Creating SKLearnVectorStore...")

    # Initialize OpenAI embeddings
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    persist_path = os.getcwd()+"/sklearn_vectorstore.parquet"

    # An SKLearnVectorStore constructed with a persist_path that already
    # exists loads that file (this is how the store is reopened elsewhere in
    # this notebook). Building on top of an old file would therefore append
    # the new chunks to the old ones and duplicate every entry each time this
    # function is re-run, so start from a clean slate.
    if os.path.isfile(persist_path):
        os.remove(persist_path)

    # Create vector store from documents using SKLearn
    vectorstore = SKLearnVectorStore.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_path=persist_path,
        serializer="parquet",
    )
    print("SKLearnVectorStore created successfully.")

    vectorstore.persist()
    print("SKLearnVectorStore was persisted to", persist_path)

    return vectorstore

In [5]:
# Load the documents; tokens_per_doc holds one token count per loaded document
documents, tokens_per_doc = load_langgraph_docs()

# Save the documents to a single text file (llms_full.txt)
save_llms_full(documents)

# Split the documents into chunks for embedding
split_docs = split_documents(documents)

# Create the vector store; `vectorstore` is reused by the retrieval cell below
vectorstore = create_vectorstore(split_docs)

Loading LangGraph documentation...
Loaded 5 documents from LangGraph documentation.

Loaded URLs:
1. https://langchain-ai.github.io/langgraph/concepts/
2. https://langchain-ai.github.io/langgraph/how-tos/
3. https://langchain-ai.github.io/langgraph/tutorials/workflows/
4. https://langchain-ai.github.io/langgraph/tutorials/introduction/
5. https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/
Total tokens in loaded documents: 9375
Documents concatenated into llms_full.txt
Splitting documents...
Created 6 chunks from documents.
Total tokens in split documents: 9686
Creating SKLearnVectorStore...
SKLearnVectorStore created successfully.
SKLearnVectorStore was persisted to /Users/eshanjain/Desktop/ragnarok/sklearn_vectorstore.parquet


In [6]:
# Wrap the vector store in a retriever that returns the 3 best matches
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Run a sample query against the retriever
query = "What is LangGraph?"
relevant_docs = retriever.invoke(query)
print(f"Retrieved {len(relevant_docs)} relevant documents")

# Show each hit's source URL followed by the first 500 characters of its text
for hit in relevant_docs:
    print(hit.metadata['source'])
    print(hit.page_content[:500])
    print("\n--------------------------------\n")

Retrieved 3 relevant documents
https://langchain-ai.github.io/langgraph/tutorials/workflows/
# Write the updated section to completed sections
    return result.content

@task
def synthesizer(completed_sections: list[str]):
    """Synthesize full report from sections"""
    final_report = "\n\n---\n\n".join(completed_sections)
    return final_report

@entrypoint()
def orchestrator_worker(topic: str):
    sections = orchestrator(topic).result()
    section_futures = [llm_call(section) for section in sections]
    final_report = synthesizer(
        [section_fut.result() for section_fut 

--------------------------------

https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/
Run a local server¶
This guide shows you how to run a LangGraph application locally.
Prerequisites¶
Before you begin, ensure you have the following:

An API key for LangSmith - free to sign up

1. Install the LangGraph CLI¶
# Python >= 3.11 is required.

pip install --upgrade "langgraph

In [7]:
from langchain_core.tools import tool

@tool
def langgraph_query_tool(query: str):
    """
    Query the LangGraph documentation using a retriever.
    
    Args:
        query (str): The query to search the documentation with

    Returns:
        str: A str of the retrieved documents
    """
    # NOTE(review): the docstring above is intentionally left as written --
    # @tool presumably exposes it to the model as the tool description, so
    # editing it would change runtime behaviour. Confirm before rewording.

    # Reopen the persisted parquet store from the working directory on every
    # call, using the same embedding model name as when it was built.
    embedder = OpenAIEmbeddings(model="text-embedding-3-large")
    store_path = os.getcwd()+"/sklearn_vectorstore.parquet"
    store = SKLearnVectorStore(
        embedding=embedder,
        persist_path=store_path,
        serializer="parquet",
    )

    # Fetch the 3 best matching chunks for the query
    hits = store.as_retriever(search_kwargs={"k": 3}).invoke(query)
    print(f"Retrieved {len(hits)} relevant documents")

    # One "==DOCUMENT n==" section per hit, separated by blank lines
    sections = []
    for position, hit in enumerate(hits, start=1):
        sections.append(f"==DOCUMENT {position}==\n{hit.page_content}")
    return "\n\n".join(sections)

In [10]:
# Chat model with the documentation lookup tool bound to it
llm = ChatAnthropic(model="claude-3-7-sonnet-latest", temperature=0)
augmented_llm = llm.bind_tools([langgraph_query_tool])

# System prompt, built from adjacent literals so that the closing double
# quote of "I don't know." is part of the text. Written as a triple-quoted
# string ending in `."""`, that quote was consumed by the string terminator
# and the prompt ended with an unbalanced quotation mark.
instructions = (
    "You are a helpful assistant that can answer questions about the LangGraph documentation. \n"
    "Use the langgraph_query_tool for any questions about the documentation.\n"
    'If you don\'t know the answer, say "I don\'t know."'
)

messages = [
    {"role": "system", "content": instructions},
    {"role": "user", "content": "What is LangGraph?"}
]

# A single model turn: the reply may be a request to call the tool rather
# than a final answer. This cell only prints the reply; it does not execute
# the requested tool call.
message = augmented_llm.invoke(messages)
message.pretty_print()


[{'text': "I'll help you understand what LangGraph is by searching the documentation.", 'type': 'text'}, {'id': 'toolu_01FMcZUT7FjCyMUeEnTkRwUU', 'input': {'query': 'What is LangGraph'}, 'name': 'langgraph_query_tool', 'type': 'tool_use'}]
Tool Calls:
  langgraph_query_tool (toolu_01FMcZUT7FjCyMUeEnTkRwUU)
 Call ID: toolu_01FMcZUT7FjCyMUeEnTkRwUU
  Args:
    query: What is LangGraph


In [11]:
import re, os
import tiktoken

from bs4 import BeautifulSoup

from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_core.prompts import ChatPromptTemplate

class LangGraphRAG:
    """
    Retrieval-augmented question answering over the LangGraph documentation.

    The class bundles the whole pipeline: loading documentation pages,
    splitting them into chunks, embedding the chunks into a persisted
    SKLearnVectorStore, retrieving the chunks most relevant to a question and
    asking a chat model to answer from that context.

    Call ``setup_rag_system()`` before ``retrieve_context()`` or ``query()``.
    """

    def __init__(self, vectorstore_path=None):
        """
        Initialize the LangGraph RAG system.

        No vector store is loaded or built here; ``vectorstore`` and
        ``retriever`` stay ``None`` until ``setup_rag_system()`` (or
        ``load_vectorstore()``) is called.

        Args:
            vectorstore_path (str): Path of the persisted vector store. When
                None, ``sklearn_vectorstore.parquet`` in the current working
                directory is used.
        """
        self.vectorstore_path = vectorstore_path or os.path.join(os.getcwd(), "sklearn_vectorstore.parquet")
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        self.llm = ChatAnthropic(model="claude-3-5-sonnet-20241022", temperature=0)
        self.vectorstore = None
        self.retriever = None

        # Prompt template for RAG: the retrieved context goes into the system
        # message, the user's question into the human message.
        self.prompt_template = ChatPromptTemplate.from_messages([
            ("system", """You are a helpful assistant that answers questions about LangGraph documentation. 
Use the provided context to answer the user's question accurately and comprehensively.

Context:
{context}

Instructions:
- Answer based primarily on the provided context
- If the context doesn't contain enough information, say so clearly
- Be specific and cite relevant details from the documentation
- If you're unsure, acknowledge the uncertainty"""),
            ("human", "{question}")
        ])

    def count_tokens(self, text, model="cl100k_base"):
        """
        Count the number of tokens in the text using tiktoken.

        Args:
            text (str): The text to count tokens for
            model (str): Name of the tiktoken encoding to use

        Returns:
            int: Number of tokens in the text
        """
        encoder = tiktoken.get_encoding(model)
        return len(encoder.encode(text))

    def bs4_extractor(self, html: str) -> str:
        """
        Extract text content from HTML using BeautifulSoup.

        Uses the ``<article class="md-content__inner">`` element when present
        and the whole document otherwise, then collapses runs of blank lines.

        Args:
            html (str): Raw HTML markup of the page

        Returns:
            str: The extracted, whitespace-normalised text
        """
        soup = BeautifulSoup(html, "lxml")

        # Target the main article content for LangGraph documentation
        main_content = soup.find("article", class_="md-content__inner")

        # If found, use that, otherwise fall back to the whole document
        content = main_content.get_text() if main_content else soup.text

        # Collapse runs of 2+ newlines to exactly two and trim the ends
        content = re.sub(r"\n\n+", "\n\n", content).strip()

        return content

    def load_langgraph_docs(self):
        """
        Load LangGraph documentation pages and report their token counts.

        Returns:
            tuple: ``(docs, tokens_per_doc)`` -- the loaded Document objects
            and a list of token counts in the same order.
        """
        print("Loading LangGraph documentation...")

        # Root pages the loader starts from
        urls = [
            "https://langchain-ai.github.io/langgraph/concepts/",
            "https://langchain-ai.github.io/langgraph/how-tos/",
            "https://langchain-ai.github.io/langgraph/tutorials/workflows/",
            "https://langchain-ai.github.io/langgraph/tutorials/introduction/",
            "https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/",
        ]

        docs = []
        for url in urls:
            loader = RecursiveUrlLoader(
                url,
                max_depth=5,
                extractor=self.bs4_extractor,
            )

            # Consume the lazy iterator straight into the result list
            docs.extend(loader.lazy_load())

        print(f"Loaded {len(docs)} documents from LangGraph documentation.")
        print("\nLoaded URLs:")
        for position, doc in enumerate(docs, start=1):
            print(f"{position}. {doc.metadata.get('source', 'Unknown URL')}")

        # Tokenise each document once; the total is derived from the list
        tokens_per_doc = [self.count_tokens(doc.page_content) for doc in docs]
        total_tokens = sum(tokens_per_doc)
        print(f"Total tokens in loaded documents: {total_tokens}")

        return docs, tokens_per_doc

    def save_docs_to_file(self, documents, filename="llms_full.txt"):
        """
        Save the documents to a single text file.

        Args:
            documents (list): Objects exposing ``metadata`` (a dict) and
                ``page_content`` (a str)
            filename (str): Path of the file to (over)write
        """
        # Written as UTF-8 explicitly: scraped pages contain non-ASCII
        # characters that the platform default encoding may not support.
        with open(filename, "w", encoding="utf-8") as f:
            for position, doc in enumerate(documents, start=1):
                source = doc.metadata.get('source', 'Unknown URL')
                f.write(f"DOCUMENT {position}\n")
                f.write(f"SOURCE: {source}\n")
                f.write("CONTENT:\n")
                f.write(doc.page_content)
                f.write("\n\n" + "=" * 80 + "\n\n")

        print(f"Documents saved to {filename}")

    def split_documents(self, documents):
        """
        Split documents into smaller chunks for improved retrieval.

        Args:
            documents (list): List of Document objects to split

        Returns:
            list: The chunked Document objects
        """
        print("Splitting documents...")

        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=8000,
            chunk_overlap=500,
        )

        split_docs = text_splitter.split_documents(documents)

        print(f"Created {len(split_docs)} chunks from documents.")

        total_tokens = sum(self.count_tokens(doc.page_content) for doc in split_docs)
        print(f"Total tokens in split documents: {total_tokens}")

        return split_docs

    def create_vectorstore(self, splits):
        """
        Create a vector store from document chunks and persist it.

        Any file already present at ``self.vectorstore_path`` is removed
        first so the persisted store contains exactly ``splits``.

        Args:
            splits (list): List of split Document objects to embed

        Returns:
            SKLearnVectorStore: The newly built (and persisted) vector store
        """
        print("Creating SKLearnVectorStore...")

        # An SKLearnVectorStore constructed with an existing persist_path
        # loads that file (load_vectorstore relies on exactly this). Building
        # on top of an old file would append the new chunks to the old ones,
        # so a forced rebuild would duplicate every entry. Start clean.
        if os.path.isfile(self.vectorstore_path):
            os.remove(self.vectorstore_path)

        vectorstore = SKLearnVectorStore.from_documents(
            documents=splits,
            embedding=self.embeddings,
            persist_path=self.vectorstore_path,
            serializer="parquet",
        )
        print("SKLearnVectorStore created successfully.")

        vectorstore.persist()
        print(f"SKLearnVectorStore persisted to {self.vectorstore_path}")

        return vectorstore

    def load_vectorstore(self):
        """
        Load existing vectorstore from disk.

        On success ``self.vectorstore`` and ``self.retriever`` (top 3
        matches) are set.

        Returns:
            bool: True when a store exists at ``self.vectorstore_path`` and
            was loaded, False otherwise
        """
        if os.path.exists(self.vectorstore_path):
            print(f"Loading existing vectorstore from {self.vectorstore_path}")
            self.vectorstore = SKLearnVectorStore(
                embedding=self.embeddings,
                persist_path=self.vectorstore_path,
                serializer="parquet"
            )
            self.retriever = self.vectorstore.as_retriever(search_kwargs={"k": 3})
            return True
        else:
            print(f"Vectorstore not found at {self.vectorstore_path}")
            return False

    def setup_rag_system(self, force_rebuild=False):
        """
        Set up the complete RAG system.

        Reuses the persisted vector store when one exists; otherwise (or when
        ``force_rebuild`` is true) loads the documentation, saves it to a
        text file, splits it, and builds a fresh vector store.

        Args:
            force_rebuild (bool): Rebuild the vector store even if a
                persisted one is available
        """
        if not force_rebuild and self.load_vectorstore():
            print("Using existing vectorstore.")
            return

        print("Building new vectorstore...")
        # Load documents
        documents, _ = self.load_langgraph_docs()

        # Save documents to file (optional)
        self.save_docs_to_file(documents)

        # Split documents
        split_docs = self.split_documents(documents)

        # Create vectorstore
        self.vectorstore = self.create_vectorstore(split_docs)
        self.retriever = self.vectorstore.as_retriever(search_kwargs={"k": 3})

    def retrieve_context(self, query: str) -> str:
        """
        Retrieve relevant context for a query.

        Args:
            query (str): The text to search the vector store with

        Returns:
            str: One ``==DOCUMENT n==`` section per retrieved chunk (source
            and content), separated by blank lines

        Raises:
            ValueError: If the retriever has not been set up yet
        """
        if not self.retriever:
            raise ValueError("RAG system not initialized. Call setup_rag_system() first.")

        relevant_docs = self.retriever.invoke(query)
        print(f"Retrieved {len(relevant_docs)} relevant documents")

        # Format context from retrieved documents
        formatted_context = "\n\n".join(
            f"==DOCUMENT {position}==\nSource: {doc.metadata.get('source', 'Unknown')}\nContent: {doc.page_content}"
            for position, doc in enumerate(relevant_docs, start=1)
        )

        return formatted_context

    def query(self, question: str) -> str:
        """
        Query the RAG system and get an answer.

        Args:
            question (str): The question to ask about LangGraph

        Returns:
            str: The answer based on the retrieved context

        Raises:
            ValueError: If the retriever has not been set up yet
        """
        if not self.retriever:
            raise ValueError("RAG system not initialized. Call setup_rag_system() first.")

        # Retrieve relevant context
        context = self.retrieve_context(question)

        # Fill the prompt template and ask the chat model
        messages = self.prompt_template.format_messages(
            context=context,
            question=question
        )

        response = self.llm.invoke(messages)
        return response.content




In [12]:
def main():
    """Run a small end-to-end demo: set up LangGraphRAG and answer sample questions."""
    rag = LangGraphRAG()

    # Reuses a persisted vector store when available, otherwise builds one
    rag.setup_rag_system()

    demo_questions = ["What is LangGraph?"]
    banner = "=" * 50

    print("\n" + banner)
    print("LangGraph RAG System - Example Queries")
    print(banner)

    for demo_question in demo_questions:
        print(f"\nQ: {demo_question}")
        print("-" * 30)
        # Demo boundary: report a failed question and carry on with the next
        try:
            print(f"A: {rag.query(demo_question)}")
        except Exception as exc:
            print(f"Error: {exc}")
        print("\n" + banner)


if __name__ == "__main__":
    main()

Loading existing vectorstore from /Users/eshanjain/Desktop/ragnarok/sklearn_vectorstore.parquet
Using existing vectorstore.

LangGraph RAG System - Example Queries

Q: What is LangGraph?
------------------------------
Retrieved 3 relevant documents
A: Based on the provided context, LangGraph is a framework for building workflows and agents using LLMs (Large Language Models). Here are the key aspects of LangGraph:

1. Core Functionality:
- LangGraph allows you to build both workflows (systems where LLMs and tools are orchestrated through predefined code paths) and agents (systems where LLMs dynamically direct their own processes and tool usage)
- It provides support for common patterns in agentic systems

2. Key Benefits:
- Persistence: Supports human-in-the-loop interactions and both short-term and long-term memory
- Streaming: Provides multiple ways to stream workflow/agent outputs or intermediate state
- Deployment: Offers easy on-ramp for deployment, observability, and evaluation

3