# Preliminaries

In [1]:
# Install the packages imported by the cells below: the RAGStack LangChain
# integration (with its knowledge-store extra), Beautiful Soup, markdownify and
# python-dotenv. `-q` keeps pip's output quiet.
%pip install -q ragstack-ai-langchain[knowledge-store] beautifulsoup4 markdownify python-dotenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.0/974.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.7/314.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.1/127.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

# Load the Astra Documentation into Graph Store

First, we'll crawl the DataStax documentation. LangChain includes a `SitemapLoader`, but it loads all of the pages into memory simultaneously, which makes it impossible to index larger sites from small environments (such as Colab). So, we'll scrape the sitemaps ourselves and iterate over the URLs, allowing us to process documents in batches and flush them to Astra DB.

## Scrape the URLs from the Site Maps
First, we use Beautiful Soup to parse the XML content of each sitemap and get the list of URLs.
We also add a few extra URLs for external sites that are also useful to include in the index.

In [3]:
# Sitemaps whose listed pages are crawled for content.
SITEMAPS = [
    "https://docs.datastax.com/en/sitemap-astra-db-vector.xml",
    "https://docs.datastax.com/en/sitemap-cql.xml",
    "https://docs.datastax.com/en/sitemap-dev-app-drivers.xml",
    "https://docs.datastax.com/en/sitemap-glossary.xml",
    "https://docs.datastax.com/en/sitemap-astra-db-serverless.xml",
]

# Additional URLs to crawl for content.
EXTRA_URLS = ["https://github.com/jbellis/jvector"]

# Prefix for the Cassandra table names. It must match the value re-assigned in
# the "Initialize Cassio and Graph Store" section, which is the one the table
# names are actually built from.
SITE_PREFIX = "astra_docs"

from bs4 import BeautifulSoup
import requests


def load_pages(sitemap_url):
    """Yield the page URL of every ``<url>`` entry listed in a sitemap.

    Args:
        sitemap_url: Address of the sitemap XML document to download.

    Yields:
        The whitespace-stripped text of each entry's ``<loc>`` element.

    Raises:
        requests.HTTPError: If the sitemap request returns an error status.
        requests.Timeout: If the server does not answer within the timeout.

    NOTE: the "Load the content from each URL" cell later defines an async
    function with the same name, which shadows this one once that cell has run.
    """
    response = requests.get(
        sitemap_url,
        headers={
            # Astra docs only return a sitemap with a user agent set.
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0",
        },
        # Without a timeout an unresponsive server would hang the notebook.
        timeout=30,
    )
    # Fail loudly instead of parsing an error page into an empty URL list.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features="xml")
    for url_tag in soup.find_all("url"):
        loc = url_tag.find("loc")
        # Skip malformed entries (no <loc>, or an empty one) rather than
        # crashing on None or yielding a blank URL.
        if loc is None:
            continue
        page_url = loc.text.strip()
        if page_url:
            yield page_url


# Flatten the page URLs of every sitemap into one list, then add the extra URLs.
# For maintenance purposes, we could check only the new articles since a given time.
URLS = [
    *(page_url for sitemap in SITEMAPS for page_url in load_pages(sitemap)),
    *EXTRA_URLS,
]
len(URLS)

1373

## Load the content from each URL
Next, we create the code to load each page. This performs the following steps:

1. Parses the HTML with BeautifulSoup
2. Locates the "content" of the HTML using an appropriate selector based on the URL
3. Finds the link (`<a href="...">`) tags in the content and collects the absolute URLs (for creating edges).
4. Converts the selected content to Markdown.

Adding the URLs of these references to the metadata allows the graph store to create edges between the documents.

In [4]:
from langchain_community.document_loaders import AsyncHtmlLoader
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from typing import AsyncIterator, Iterable
from ragstack_knowledge_store.graph_store import CONTENT_ID
from markdownify import MarkdownConverter
from ragstack_langchain.graph_store.extractors import HtmlLinkEdgeExtractor

# Shared helper that records the links found in a page on its document, so the
# graph store can create edges from them.
html_link_extractor = HtmlLinkEdgeExtractor()
# Shared converter that renders the selected HTML as Markdown with ATX headings.
markdown_converter = MarkdownConverter(heading_style="ATX")


def select_content(soup: "BeautifulSoup", url: str) -> "BeautifulSoup":
    """Return the part of a parsed page that holds its main content.

    Args:
        soup: Parsed HTML of the page.
        url: Address the page was loaded from; its prefix picks the selector.

    Returns:
        The first element matching the site's content selector. The whole
        ``soup`` is returned when the URL belongs to no known site or when the
        selector matches nothing, so callers never receive ``None``.
    """
    # Pick the CSS selector of the element wrapping the main content.
    if url.startswith("https://docs.datastax.com/en/"):
        selector = "article.doc"
    elif url.startswith("https://github.com"):
        selector = "article.entry-content"
    else:
        return soup

    content = soup.select_one(selector)
    # `select_one` gives None when nothing matches (e.g. a changed page layout);
    # fall back to the full page instead of handing None to the link extractor
    # and the markdown converter.
    return soup if content is None else content


async def load_pages(urls: Iterable[str]) -> AsyncIterator[Document]:
    """Fetch each URL and yield it as a Markdown document with link metadata.

    Every yielded document is the loader's own document, modified in place:
    its content ID is set to the source URL, the links found in the selected
    content are extracted onto it, and its page content is replaced by the
    Markdown rendering of that selected content.

    Args:
        urls: Addresses of the pages to load.

    Yields:
        One document per loaded page.
    """
    # Astra docs require a user agent
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0"
    }
    page_loader = AsyncHtmlLoader(
        urls,
        requests_per_second=4,
        header_template=browser_headers,
    )

    async for page in page_loader.alazy_load():
        source_url = page.metadata["source"]

        # The URL doubles as the content ID.
        page.metadata[CONTENT_ID] = source_url

        # Narrow the HTML down to the main content right away, so the document
        # shrinks as early as possible and memory usage stays low.
        parsed = BeautifulSoup(page.page_content, "html.parser")
        main_content = select_content(parsed, source_url)

        # Record the HTML links found in the main content on the document.
        html_link_extractor.extract_one(page, main_content)

        # Replace the raw HTML with its Markdown rendering.
        page.page_content = markdown_converter.convert_soup(main_content)

        yield page

: 

## Initialize Environment
Before we initialize the Graph Store and write the documents we need to set some environment variables.
In Colab, this will prompt you for the values. When running locally, they are loaded from `.env`.

In [5]:
import os

if not os.getenv("COLAB_RELEASE_TAG"):
    # Local run: read the settings from a `.env` file.
    print("Not in colab. Loading '.env' (see 'env.template' for example)")
    import dotenv

    dotenv.load_dotenv()
else:
    # Colab run: ask for every setting interactively. Secrets go through
    # getpass so they are not echoed into the notebook output.
    print("In colab. Using getpass/input for environment variables.")
    import getpass

    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")
    os.environ["ASTRA_DB_DATABASE_ID"] = input("Enter Astra DB Database ID: ")
    os.environ["ASTRA_DB_APPLICATION_TOKEN"] = getpass.getpass(
        "Enter Astra DB Application Token: "
    )

    # An empty answer means "use the default keyspace": make sure no stale
    # value from an earlier run is left in the environment.
    keyspace = input("Enter Astra DB Keyspace (Empty for default): ")
    if not keyspace:
        os.environ.pop("ASTRA_DB_KEYSPACE", None)
    else:
        os.environ["ASTRA_DB_KEYSPACE"] = keyspace

Not in colab. Loading '.env' (see 'env.template' for example)


## Initialize Cassio and Graph Store
With the environment variables set we initialize the Cassio library for talking to Cassandra / Astra DB.
We also create the `GraphStore`.

In [6]:
# Prefix shared by the Cassandra table names built in the cells below. This
# assignment replaces any value given to SITE_PREFIX earlier in the notebook.
SITE_PREFIX = "astra_docs"

In [7]:
# Optionally start from a clean slate. Any answer other than "y"/"yes"
# (case-insensitive, surrounding whitespace ignored) keeps the existing tables.
answer = input("Drop Tables? [(Y)es/(N)o]")
if answer.strip().lower() in ["y", "yes"]:
    import cassio

    cassio.init(auto=True)
    from cassio.config import check_resolve_session, check_resolve_keyspace

    session = check_resolve_session()
    keyspace = check_resolve_keyspace()
    # "edges" is the table name the graph store cell passes as `edge_table=`,
    # so it has to be dropped for the reset to be complete. "targets" was the
    # name dropped here before and is kept so leftovers under it are removed too.
    for table_suffix in ("nodes", "edges", "targets"):
        session.execute(
            f"DROP TABLE IF EXISTS {keyspace}.{SITE_PREFIX}_{table_suffix}"
        )

ERROR:cassandra.connection:Closing connection <LibevConnection(4414837392) 4f913ede-cf76-4f98-b390-907743bafd85-us-east1.db.astra.datastax.com:29042:405388e3-1a02-4604-87ca-d3c869677703> due to protocol error: Error from server: code=000a [Protocol error] message="Beta version of the protocol used (5/v5-beta), but USE_BETA flag is unset"


In [8]:
import cassio
from langchain_openai import OpenAIEmbeddings
from ragstack_langchain.graph_store import CassandraGraphStore

# Initialise cassio first; the graph store below relies on it.
# NOTE(review): `auto=True` presumably picks the connection settings up from
# the environment variables set earlier — confirm against the cassio docs.
cassio.init(auto=True)

# Embedding model handed to the graph store.
embeddings = OpenAIEmbeddings()

# Both table names follow the SITE_PREFIX naming scheme.
graph_store = CassandraGraphStore(
    embeddings,
    node_table=f"{SITE_PREFIX}_nodes",
    edge_table=f"{SITE_PREFIX}_edges",
)



ERROR:cassandra.connection:Closing connection <LibevConnection(5131913040) 4f913ede-cf76-4f98-b390-907743bafd85-us-east1.db.astra.datastax.com:29042:405388e3-1a02-4604-87ca-d3c869677703> due to protocol error: Error from server: code=000a [Protocol error] message="Beta version of the protocol used (5/v5-beta), but USE_BETA flag is unset"


## Load the Documents
Finally, we fetch pages and write them to the graph store in batches of 50.

In [9]:
not_found = 0
found = 0

docs = []
async for doc in load_pages(URLS):
    if doc.page_content.startswith("\n# Page Not Found"):
        not_found += 1
        continue

    docs.append(doc)
    found += 1

    if len(docs) >= 50:
        graph_store.add_documents(docs)
        docs.clear()

if docs:
    graph_store.add_documents(docs)
print(f"{not_found} (of {not_found + found}) URLs were not found")

Fetching pages: 100%|##########| 1373/1373 [03:46<00:00,  6.06it/s]


96 (of 1373) URLs were not found


# Create and execute the RAG Chains

In [10]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Chat model that answers the questions.
llm = ChatOpenAI(model="gpt-4o")

# Prompt text with two placeholders: the retrieved `context` and the user's
# `question`. Built from adjacent literals; the resulting string is unchanged.
template = (
    "You are a helpful technical support bot. "
    "You should provide complete answers explaining the options the user has available to address their problem. "
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Render documents as one context string for the prompt.

    Each document becomes ``From <content_id>: <page_content>``; the entries
    are separated by a blank line.

    Args:
        docs: Iterable of documents exposing ``metadata`` and ``page_content``.

    Returns:
        The joined text, or an empty string when ``docs`` is empty.
    """
    sections = []
    for doc in docs:
        sections.append(f"From {doc.metadata['content_id']}: {doc.page_content}")
    return "\n\n".join(sections)

We'll use the following question. This is an interesting question because the ideal answer should be concise and in-depth, based on how the vector indexing is actually implemented.

In [11]:
# The single question that every retrieval strategy below is asked.
QUESTION = "What vector indexing algorithms does Astra use?"

In [12]:
from IPython.display import display, Markdown


def run_and_render(chain, question):
    """Invoke ``chain`` with ``question`` and display the result as Markdown.

    Args:
        chain: Object whose ``invoke(question)`` returns Markdown text.
        question: Input passed to the chain.
    """
    display(Markdown(chain.invoke(question)))

## Vector-Only Retrieval

In [13]:
# Depth 0 doesn't traverse edges, so it is equivalent to vector similarity only.
vector_retriever = graph_store.as_retriever(search_kwargs={"depth": 0})

# RAG chain: the retriever's documents are formatted into `context`, the input
# is forwarded as `question` (RunnablePassthrough), both fill the prompt, the
# prompt goes to the LLM, and the reply is parsed into a plain string.
vector_rag_chain = (
    {"context": vector_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [14]:
# Answer the question with vector-only retrieval and render the reply.
run_and_render(vector_rag_chain, QUESTION)

Astra DB Serverless uses multiple indexing techniques to speed up vector searches. The primary vector indexing algorithms and techniques include:

1. **JVector**:
   - **Description**: The Serverless (Vector) database uses the JVector vector search engine to construct a graph index.
   - **Features**: JVector adds new documents to the graph immediately, allowing for efficient searches right away. It can also compress vectors with quantization to save space and improve performance.
   - **Further Information**: [JVector on GitHub](https://github.com/jbellis/jvector)

2. **Storage-Attached Index (SAI)**:
   - **Description**: SAI is an indexing technique used to efficiently find rows that satisfy query predicates. Astra DB provides numeric-, text-, and vector-based indexes to support different kinds of searches.
   - **Features**: SAI can be customized based on specific requirements, such as a particular similarity function or text transformation. When a search is run, SAI loads a superset of possible results from storage based on the provided predicates and sorts the results by vector similarity. The top `limit` results are then returned to the user.
   - **Further Information**: [Storage-Attached Indexing (SAI) Overview](https://docs.datastax.com/en/cql/astra/developing/indexing/sai/sai-overview.html)

These indexing techniques are designed to make vector searches faster and more efficient, supporting a variety of use cases such as semantic search, AI applications, and more.

## Graph Traversal Retrieval

In [15]:
# Depth 1 does vector similarity and then traverses 1 level of edges.
graph_retriever = graph_store.as_retriever(search_kwargs={"depth": 1})

# Same chain shape as the vector-only chain; only the retriever differs.
graph_rag_chain = (
    {"context": graph_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
# Answer the same question with depth-1 graph traversal and render the reply.
run_and_render(graph_rag_chain, QUESTION)

Astra DB Serverless utilizes the JVector vector search engine to construct a graph index for vector databases. The JVector engine is part of the DiskANN family of graph-based index algorithms, which are known for their efficiency and scalability in high-dimensional spaces. 

### Key Features of JVector:
1. **Graph-Based Index**: JVector constructs a single-layer graph with nonblocking concurrency control, allowing the index to scale linearly with the number of cores.
2. **Two-Pass Search**: The search process in JVector includes a first pass powered by lossily compressed representations of the vectors kept in memory and a second pass using a more accurate representation read from disk.
3. **Compression Techniques**:
   - **Product Quantization (PQ)**: Used to compress vectors for efficient storage and search.
   - **Binary Quantization (BQ)**: Another method of compression, though generally less useful than PQ due to its impact on search accuracy.
   - **Fused ADC**: An advanced method where PQ codebooks are transposed and written inline with the graph adjacency list, improving the efficiency of the search.

### Why Graph-Based Indexing?
Graph-based indexes, such as JVector, are preferred for several reasons:
- **Incremental Construction and Updates**: Unlike partition-based indexes, graph-based indexes can be constructed and updated incrementally, making them suitable for dynamic datasets.
- **Performance**: Graph-based indexes tend to be faster and simpler to implement.
- **Scalability**: They support large-scale datasets and can handle high-dimensional vector spaces efficiently.

### Comparison to Other ANN Indexes:
- **Partition-Based Indexes**: Examples include LSH (Locality-Sensitive Hashing) and IVF (Inverted File Index). These are typically better suited for static datasets and might not perform as well as graph-based indexes in dynamic environments.
- **Graph-Based Indexes**: Examples include HNSW (Hierarchical Navigable Small World) and DiskANN. These indexes are favored for their incremental update capabilities and overall efficiency in high-dimensional spaces.

### Conclusion:
Astra DB Serverless leverages JVector, a sophisticated graph-based index, to provide efficient and scalable vector search capabilities. This choice aligns with the needs of modern applications that require real-time updates and high performance in handling large, high-dimensional datasets.

## MMR Graph Traversal

In [17]:
# Retriever using the "mmr_traversal" search type.
# NOTE(review): presumably `k` is the number of documents returned, `fetch_k`
# the size of the initial candidate pool and `depth` the maximum number of
# edges followed — confirm against the ragstack graph store documentation.
mmr_graph_retriever = graph_store.as_retriever(
    search_type="mmr_traversal",
    search_kwargs={
        "k": 4,
        "fetch_k": 10,
        "depth": 2,
        # "score_threshold": 0.2,
    },
)

# Same chain shape as the other two chains; only the retriever differs.
mmr_graph_rag_chain = (
    {"context": mmr_graph_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
# Answer the same question with MMR graph traversal and render the reply.
run_and_render(mmr_graph_rag_chain, QUESTION)

Astra DB Serverless uses multiple vector indexing techniques to speed up searches. Specifically, it uses:

1. **JVector**: Astra DB utilizes the JVector vector search engine to construct a graph index. JVector is a graph-based index that builds on the DiskANN design. It constructs a single-layer graph with nonblocking concurrency control, which allows for scalable construction with the number of cores. JVector can also compress vectors using techniques like product quantization to save space and improve performance.

2. **Storage-Attached Index (SAI)**: SAI is another indexing technique available in Astra DB to efficiently find rows that satisfy query predicates. It supports numeric-, text-, and vector-based indexes which can be customized based on specific requirements. When running a search, SAI loads a superset of all possible results from storage based on the predicates provided and evaluates the search criteria, sorting the results by vector similarity before returning the top results.

These indexing methods are designed to optimize the performance and efficiency of vector searches within Astra DB Serverless.

## Check Retrieval Results

In [22]:
# Set the question and see what documents each technique retrieves.
for i, doc in enumerate(vector_retriever.invoke(QUESTION)):
    print(f"Vector [{i}]:    {doc.metadata['content_id']}")

for i, doc in enumerate(graph_retriever.invoke(QUESTION)):
    print(f"Graph [{i}]:     {doc.metadata['content_id']}")

for i, doc in enumerate(mmr_graph_retriever.invoke(QUESTION)):
    print(f"MMR Graph [{i}]: {doc.metadata['content_id']}")

Vector [0]:    https://docs.datastax.com/en/astra-db-serverless/get-started/concepts.html
Vector [1]:    https://docs.datastax.com/en/cql/astra/getting-started/vector-search-quickstart.html
Vector [2]:    https://docs.datastax.com/en/cql/astra/developing/indexing/indexing-concepts.html
Vector [3]:    https://docs.datastax.com/en/astra-db-serverless/databases/database-overview.html
Vector took 0.4749s
Graph [0]:     https://docs.datastax.com/en/astra-db-serverless/get-started/concepts.html
Graph [1]:     https://docs.datastax.com/en/cql/astra/getting-started/vector-search-quickstart.html
Graph [2]:     https://docs.datastax.com/en/cql/astra/developing/indexing/indexing-concepts.html
Graph [3]:     https://docs.datastax.com/en/astra-db-serverless/databases/database-overview.html
Graph [4]:     https://docs.datastax.com/en/glossary/index.html
Graph [5]:     https://docs.datastax.com/en/astra-db-serverless/administration/maintenance-schedule.html
Graph [6]:     https://docs.datastax.com/en

# Conclusion
With vector-only retrieval, we retrieved chunks from the Astra documentation explaining that it uses JVector.
Because it didn't follow the link to [JVector on GitHub](https://github.com/jbellis/jvector), it didn't actually answer the question.

The graph retrieval started with the same set of chunks, but it followed the edge to the documents we loaded from GitHub.
This allowed the LLM to read in more depth how JVector is implemented, which allowed it to answer the question more clearly and with more detail.