# Building RAG for Anyscale docs

## Initialization

In [None]:
# Imports
import os
import ray
import sys; sys.path.append("..")
import warnings; warnings.filterwarnings("ignore")
from dotenv import load_dotenv; load_dotenv()
%load_ext autoreload
%autoreload 2
from rag.config import ROOT_DIR

  from .autonotebook import tqdm as notebook_tqdm
2024-08-28 23:50:59,483	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [None]:
# Initialize Ray and forward the credentials that remote workers need;
# we're not using Anyscale Endpoints.
forwarded_env_vars = ("OPENAI_API_BASE", "OPENAI_API_KEY", "DB_CONNECTION_STRING")

ray.init(
    runtime_env={
        # Each lookup raises KeyError if the variable is missing from this process.
        "env_vars": {name: os.environ[name] for name in forwarded_env_vars},
        "working_dir": str(ROOT_DIR),
    }
)

2024-08-28 23:50:59,900	INFO worker.py:1598 -- Connecting to existing Ray cluster at address: 10.0.62.130:6379...
2024-08-28 23:50:59,907	INFO worker.py:1774 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://session-vig1su7dbnvzqbs3l1tw35fhk3.i.anyscaleuserdata-staging.com [39m[22m
2024-08-28 23:51:00,077	INFO packaging.py:530 -- Creating a file package for local directory '/home/ray/default/anyscale-ragbot/notebooks/..'.
2024-08-28 23:51:00,313	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_959490b81f9c3a2e.zip' (68.41MiB) to Ray cluster...
2024-08-28 23:51:01,135	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_959490b81f9c3a2e.zip'.


0,1
Python version:,3.12.2
Ray version:,2.35.0
Dashboard:,http://session-vig1su7dbnvzqbs3l1tw35fhk3.i.anyscaleuserdata-staging.com


[36m(autoscaler +15s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[36m(autoscaler +15s)[0m [autoscaler] Cluster upscaled to {128 CPU, 2 GPU}.
[36m(autoscaler +20m46s)[0m [autoscaler] Downscaling node i-0aa09ae6224139ae1 (node IP: 10.0.34.153) due to node idle termination.
[36m(autoscaler +20m47s)[0m [autoscaler] Downscaling node i-023b8a2dd73eaf85b (node IP: 10.0.47.129) due to node idle termination.


In [None]:
# Only the text-embedding-3-large dimension and the gpt-4o context length are relevant for now.
from rag.config import EMBEDDING_DIMENSIONS, MAX_CONTEXT_LENGTHS

## Data

I've pre-loaded the data into `/mnt/shared_storage/emmy`: the Ray docs are in `/mnt/shared_storage/emmy/docs.ray.io/en/master` and the Anyscale docs are in `/mnt/shared_storage/emmy/docs.anyscale.com/docs`. In this section, we'll clean and chunk the data.

In [None]:
from pathlib import Path
from rag.config import EFS_DIR

# Public site the markdown files map back to, and the local copy of those files.
ANYSCALE_DOCS_URL = "https://docs.anyscale.com"
ANYSCALE_DOCS_DIR = Path(EFS_DIR) / "docs.anyscale.com" / "docs"

In [None]:
# Build one {"source", "text"} record per markdown file under the docs directory.
data = []
for path in ANYSCALE_DOCS_DIR.rglob("*.md"):
    if path.is_dir():
        continue  # skip directories whose name happens to end in ".md"
    text = path.read_text(encoding="utf-8")
    # Map the file location to its docs URL: path relative to the docs root, minus ".md".
    url_path = path.relative_to(ANYSCALE_DOCS_DIR).with_suffix("").as_posix()
    data.append({"source": f"{ANYSCALE_DOCS_URL}/{url_path}", "text": text})

In [None]:
# Turn the in-memory list of {"source", "text"} records into a Ray Dataset.
anyscale_sections_ds = ray.data.from_items(data)

2024-08-28 23:51:23,362	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Chunking

In [None]:
from functools import partial
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [None]:
# Fetch a single record to prototype the chunking on.
sample = anyscale_sections_ds.take(1)

2024-08-28 23:10:10,815	INFO dataset.py:2409 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2024-08-28 23:10:10,818	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-28_16-37-38_889087_2268/logs/ray-data
2024-08-28 23:10:10,819	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> LimitOperator[limit=1]
Running 0: 0.00 row [00:00, ? row/s]

                                                                                      
✔️  Dataset execution finished in 1.46 seconds: : 1.00 row [00:01, 1.47s/ row]          

- limit=1: 0 active, 150 queued, [cpu: 0.0, objects: 7.0KB]: : 1.00 row [00:01, 1.47s/ row]


In [None]:
# Display the fetched record: its "source" URL and raw markdown "text".
sample[0]

{'source': 'https://docs.anyscale.com/get-started',
 'text': 'import Admonition from \'@theme/Admonition\';\nimport Tabs from \'@theme/Tabs\';\nimport TabItem from \'@theme/TabItem\';\n\n# Get started\n\n## 1. Join Anyscale\n\n- [Sign up](https://console.anyscale.com) for an Anyscale account at [`console.anyscale.com`](https://console.anyscale.com).\n- Check your email for a magic link.\n\n## 2. Try it out in Anyscale\'s cloud\n\nLaunch an example template in Anyscale\'s fully hosted development environment to quickly learn the platform\'s capabilities. Anyscale set up the compute resources for you, so you can get started right away with on-demand GPUs. \n\nChoose from [deploying](https://console.anyscale.com/v2/template-preview/endpoints_v2) and [fine-tuning](https://console.anyscale.com/v2/template-preview/finetuning_llms_v2) large language models, scaling [batch inference](https://console.anyscale.com/v2/template-preview/batch-llm), [serving Stable Diffusion](https://console.anyscal

In [None]:
# Markdown heading markers "#" through "####", each paired with the metadata
# label ("Header 1" .. "Header 4") attached to the resulting chunks.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 5)]

# strip_headers=False: in the sample output the heading lines stay in the chunk text.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

In [None]:
# Try the header-based splitter on the sample document's markdown text.
chunks = markdown_splitter.split_text(sample[0]['text'])

In [None]:
# Inspect the Document chunks produced for the sample.
chunks

[Document(page_content="import Admonition from '@theme/Admonition';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';"),
 Document(metadata={'Header 1': 'Get started', 'Header 2': '1. Join Anyscale'}, page_content='# Get started  \n## 1. Join Anyscale  \n- [Sign up](https://console.anyscale.com) for an Anyscale account at [`console.anyscale.com`](https://console.anyscale.com).\n- Check your email for a magic link.'),
 Document(metadata={'Header 1': 'Get started', 'Header 2': "2. Try it out in Anyscale's cloud"}, page_content="## 2. Try it out in Anyscale's cloud  \nLaunch an example template in Anyscale's fully hosted development environment to quickly learn the platform's capabilities. Anyscale set up the compute resources for you, so you can get started right away with on-demand GPUs.  \nChoose from [deploying](https://console.anyscale.com/v2/template-preview/endpoints_v2) and [fine-tuning](https://console.anyscale.com/v2/template-preview/finetuning_llms_v2) lar

In [None]:
def chunk_md(md_doc):
    """Split one markdown document into header-delimited chunks.

    Args:
        md_doc: Mapping with a "text" entry (the markdown body) and a
            "source" entry identifying where the document came from.

    Returns:
        A list of dicts, one per chunk, each holding the chunk's "text" and
        the parent document's "source". The splitter's header metadata is
        not carried over.
    """
    # Heading markers "#" .. "####" paired with the labels "Header 1" .. "Header 4".
    heading_levels = [("#" * depth, f"Header {depth}") for depth in range(1, 5)]
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=heading_levels,
        strip_headers=False,
    )

    records = []
    for piece in splitter.split_text(md_doc["text"]):
        records.append({"text": piece.page_content, "source": md_doc["source"]})
    return records


In [None]:
# Show the sample record again; it is the input passed to chunk_md in the next cell.
sample[0]

{'source': 'https://docs.anyscale.com/get-started',
 'text': 'import Admonition from \'@theme/Admonition\';\nimport Tabs from \'@theme/Tabs\';\nimport TabItem from \'@theme/TabItem\';\n\n# Get started\n\n## 1. Join Anyscale\n\n- [Sign up](https://console.anyscale.com) for an Anyscale account at [`console.anyscale.com`](https://console.anyscale.com).\n- Check your email for a magic link.\n\n## 2. Try it out in Anyscale\'s cloud\n\nLaunch an example template in Anyscale\'s fully hosted development environment to quickly learn the platform\'s capabilities. Anyscale set up the compute resources for you, so you can get started right away with on-demand GPUs. \n\nChoose from [deploying](https://console.anyscale.com/v2/template-preview/endpoints_v2) and [fine-tuning](https://console.anyscale.com/v2/template-preview/finetuning_llms_v2) large language models, scaling [batch inference](https://console.anyscale.com/v2/template-preview/batch-llm), [serving Stable Diffusion](https://console.anyscal

In [None]:
# Run chunk_md locally on the sample record before applying it to the whole dataset.
temp = chunk_md(sample[0])

In [None]:
# Look at the first chunk dict produced from the sample.
temp[0]

{'text': "import Admonition from '@theme/Admonition';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';",
 'source': 'https://docs.anyscale.com/get-started'}

In [None]:
# Apply chunk_md to every document; flat_map turns each returned list of chunk
# dicts into individual rows of the new dataset.
chunks_ds = anyscale_sections_ds.flat_map(chunk_md)

In [None]:
# Print one chunk row; the logs below show this executes FlatMap(chunk_md) on the cluster.
chunks_ds.show(1)

2024-08-28 23:33:44,370	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-28_16-37-38_889087_2268/logs/ray-data
2024-08-28 23:33:44,370	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(chunk_md)] -> LimitOperator[limit=1]
Running 0: 0.00 row [00:00, ? row/s]
[A


[A
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:01, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:02, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:03, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:04, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:05, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:06, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:07, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:08, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:09, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:10, ? row/s]
Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:11, ? row/s]
Runni

                                                                                         
[A                                                                                                    

Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:04, ? row/s]
[A

[36m(autoscaler +23m54s)[0m [autoscaler] [48CPU-192GB] Upscaling 1 node(s).


                                                                                         
[A                                                                                                    

Running: 1/0 CPU, 0/0 GPU, 256.0MB/2.1GB object_store_memory: : 0.00 row [00:05, ? row/s]
[A

[36m(autoscaler +23m55s)[0m [autoscaler] [48CPU-192GB|m5.12xlarge] [us-west-2a] [on-demand] Launched 1 instances.
[36m(autoscaler +27m11s)[0m [autoscaler] Downscaling node i-0679e5d0b86bb5f62 (node IP: 10.0.59.88) due to node idle termination.


{'text': "import Admonition from '@theme/Admonition';\nimport Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';", 'source': 'https://docs.anyscale.com/get-started'}





## Embed

To keep things simple, I'm using OpenAI models across the board.

In [None]:
from langchain_openai import OpenAIEmbeddings

In [None]:
# Driver-side embeddings client; endpoint and key are read from the environment.
openai_settings = {
    "openai_api_base": os.environ["OPENAI_API_BASE"],
    "openai_api_key": os.environ["OPENAI_API_KEY"],
}
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large", **openai_settings)

In [None]:
class EmbedChunks:
    """Callable class for `map_batches` that adds embeddings to batches of chunks.

    The embeddings client is created once per instance in `__init__` and reused
    for every batch passed to `__call__`.
    """

    def __init__(self, model_name="text-embedding-3-large"):
        """Create the embeddings client.

        Args:
            model_name: Name of the embedding model to request. Defaults to
                "text-embedding-3-large", the model this notebook uses.

        Raises:
            KeyError: If OPENAI_API_BASE or OPENAI_API_KEY is not set in the
                environment of the process that constructs the instance.
        """
        self.embedding_model = OpenAIEmbeddings(
            model=model_name,
            openai_api_base=os.environ["OPENAI_API_BASE"],
            openai_api_key=os.environ["OPENAI_API_KEY"],
        )

    def __call__(self, batch):
        """Embed one batch.

        Args:
            batch: Mapping with a "text" column and a "source" column.

        Returns:
            A dict with the input "text" and "source" columns passed through
            unchanged, plus an "embeddings" column holding what
            `embed_documents` returned for the "text" column.
        """
        embeddings = self.embedding_model.embed_documents(batch["text"])
        return {"text": batch["text"], "source": batch["source"], "embeddings": embeddings}

In [None]:
# Embed the chunks in batches of 100 using a single EmbedChunks actor.
# No GPU is requested: EmbedChunks only calls the OpenAI embeddings API, so
# reserving one would make the actor wait for (and hold) a GPU it never uses.
embedded_chunks = chunks_ds.map_batches(
    EmbedChunks,
    batch_size=100,
    concurrency=1,
)

In [None]:
# Pull one embedded row to sanity-check the vector length and the chunk text.
sample = embedded_chunks.take(1)
first_row = sample[0]
print("embedding size:", len(first_row["embeddings"]))
print(first_row["text"])

2024-08-28 23:51:54,466	INFO dataset.py:2409 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2024-08-28 23:51:54,470	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-08-28_16-37-38_889087_2268/logs/ray-data
2024-08-28 23:51:54,471	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[FlatMap(chunk_md)] -> ActorPoolMapOperator[MapBatches(EmbedChunks)] -> LimitOperator[limit=1]
Running 0: 0.00 row [00:00, ? row/s]


[A


[A
[A

[A[A

Running: 95/128 CPU, 1/2 GPU, 23.8GB/74.1GB object_store_memory: : 0.00 row [01:00, ? row/s]
[A

Running: 94/128 CPU, 1/2 GPU, 449.6KB/74.1GB object_store_memory: : 0.00 row [01:01, ? row/s]
[A
[A

Running: 1/128 CPU, 1/2 GPU, 278.7MB/74.1GB object_store_memory: : 0.00 row [01:03, ? row/s] 
[A
[A

[A[A
[A
[A

                                                                                                     
[A                                                                                                                        

[A[A                                                                                                                                       


✔️  Dataset execution finished in 64.57 seconds: 100%|██████████| 1.00/1.00 [01:04<00:00, 64.6s/ row]

[A

[A[A
[A                                                                                                                        

[A[A                                       

embedding size: 3072
---
slug: /
---  
import Admonition from '@theme/Admonition';





## Store vectors

In [None]:
import psycopg
from pgvector.psycopg import register_vector

# The embedding model name selects both the migration file (by vector
# dimension) and the name of the SQL dump file.
embedding_model_name = "text-embedding-3-large"
embedding_dim = EMBEDDING_DIMENSIONS[embedding_model_name]
dump_name = embedding_model_name.rsplit("/", 1)[-1]

# Exported via os.environ so the %%bash cells can read them as $MIGRATION_FP / $SQL_DUMP_FP.
os.environ.update(
    {
        "MIGRATION_FP": f"../migrations/vector-{embedding_dim}.sql",
        "SQL_DUMP_FP": f"{EFS_DIR}/sql_dumps/{dump_name}.sql",
    }
)

In [None]:
%%bash
# Set up: drop any existing `document` table, then run the migration.
# Variables are quoted so paths containing spaces or glob characters are passed intact.
psql "$DB_CONNECTION_STRING" -c "DROP TABLE IF EXISTS document;"
echo "$MIGRATION_FP"
sudo -u postgres psql -f "$MIGRATION_FP"
echo "$SQL_DUMP_FP"

psql: error: connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?


../migrations/vector-3072.sql


could not change directory to "/home/ray/default/anyscale-ragbot/notebooks": Permission denied
psql: error: connection to server on socket "/var/run/postgresql/.s.PGSQL.5432" failed: No such file or directory
	Is the server running locally and accepting connections on that socket?


/mnt/shared_storage/emmy/sql_dumps/text-embedding-3-large.sql


In [None]:
%%bash
# Drop the existing `document` table and create a new one with the schema to store embeddings.
# "$MIGRATION_FP" is quoted so a path containing spaces or glob characters is passed intact.
psql "$DB_CONNECTION_STRING" -c "DROP TABLE IF EXISTS document;"  # drop
sudo -u postgres psql -f "$MIGRATION_FP"  # create
psql "$DB_CONNECTION_STRING" -c "SELECT count(*) FROM document;"  # num rows

# DROP TABLE
# CREATE TABLE
#  count 
# -------
#      0
# (1 row)

In [None]:
class StoreResults:
    """Callable class for `map_batches` that writes embedded chunks to Postgres."""

    def __call__(self, batch):
        """Insert every row of `batch` into the `document` table.

        A new connection is opened for each batch, using the
        DB_CONNECTION_STRING environment variable, and released when the
        `with` block exits.

        Args:
            batch: Mapping with parallel "text", "source" and "embeddings" columns.

        Returns:
            An empty dict; this step exists only for its database side effect.

        Raises:
            KeyError: If DB_CONNECTION_STRING is not set or a column is missing.
        """
        with psycopg.connect(os.environ["DB_CONNECTION_STRING"]) as conn:
            # Teach this connection how to send vector values to pgvector.
            register_vector(conn)
            with conn.cursor() as cur:
                rows = list(zip(batch["text"], batch["source"], batch["embeddings"]))
                # One executemany call instead of one execute per row, so the
                # driver can batch the inserts.
                cur.executemany(
                    "INSERT INTO document (text, source, embedding) VALUES (%s, %s, %s)",
                    rows,
                )
        return {}

In [None]:
# Index data: write every embedded batch to Postgres with a pool of 6 StoreResults actors.
# `concurrency=6` replaces `compute=ActorPoolStrategy(size=6)`: ActorPoolStrategy is never
# imported in this notebook (NameError), and `concurrency` matches the embedding step above.
# materialize() forces execution so the inserts actually run.
embedded_chunks.map_batches(
    StoreResults,
    batch_size=128,
    num_cpus=1,
    concurrency=6,
).materialize()

# Verify whether the embedding was stored successfully in Postgres or not.
# sudo -u postgres psql
# SELECT * FROM document LIMIT 10;

In [None]:
%%bash
# Save index: replace any previous dump file with a fresh pg_dump.
# ${VAR:?} aborts the cell if SQL_DUMP_FP is unset or empty, and all expansions are
# quoted so a path with spaces or glob characters is handled as a single argument.
rm -rf "${SQL_DUMP_FP:?SQL_DUMP_FP is not set}"
mkdir -p "$(dirname "$SQL_DUMP_FP")" && touch "$SQL_DUMP_FP"
sudo -u postgres pg_dump -c > "$SQL_DUMP_FP"  # save