Example of Building RocksDB Knowledge Base using LangChain 

In [None]:
GITHUB_REPO="facebook/rocksdb"
GITHUB_TOKEN=""

In [3]:
import requests, time

#Crawls a GitHub repository and returns a list of all ipynb files in the repository
def crawl_github_repo(url,is_sub_dir,access_token = f"{GITHUB_TOKEN}"):

    ignore_list = ['__init__.py']

    if not is_sub_dir:
        api_url = f"https://api.github.com/repos/{url}/contents"
    else:
        api_url = url

    headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"Bearer {access_token}" 
                   }

    response = requests.get(api_url, headers=headers)
    response.raise_for_status()  # Check for any request errors

    files = []

    contents = response.json()

    for item in contents:
        if item['type'] == 'file' and item['name'] not in ignore_list and (item['name'].endswith('.h') or item['name'].endswith('.cc')):
            files.append(item['html_url'])
        elif item['type'] == 'dir' and not item['name'].startswith("."):
            sub_files = crawl_github_repo(item['url'],True)
            time.sleep(.1)
            files.extend(sub_files)

    return files

In [4]:
code_files_urls = crawl_github_repo(GITHUB_REPO,False,GITHUB_TOKEN)

# Write list to a file so you do not have to download each time
with open('/home/nando/code_files_urls.txt', 'w') as f:
    for item in code_files_urls:
        f.write(item + '\n')

In [5]:
import requests
# import nbformat
import json


# Extracts the python code from an .py file from github
def extract_code(github_url):
    raw_url = github_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

    response = requests.get(raw_url)
    response.raise_for_status()  # Check for any request errors

    code = response.text

    return code

with open('/home/nando/code_files_urls.txt') as f:
    code_files_urls = f.read().splitlines()

In [7]:
with open('/home/nando/code_files_urls.txt') as f:
    code_files_urls = f.read().splitlines()

In [8]:
code_files_urls

['https://github.com/facebook/rocksdb/blob/main/options/cf_options.cc',
 'https://github.com/facebook/rocksdb/blob/main/options/cf_options.h',
 'https://github.com/facebook/rocksdb/blob/main/options/configurable.cc',
 'https://github.com/facebook/rocksdb/blob/main/options/configurable_helper.h',
 'https://github.com/facebook/rocksdb/blob/main/options/configurable_test.cc',
 'https://github.com/facebook/rocksdb/blob/main/options/configurable_test.h',
 'https://github.com/facebook/rocksdb/blob/main/options/customizable.cc',
 'https://github.com/facebook/rocksdb/blob/main/options/customizable_test.cc',
 'https://github.com/facebook/rocksdb/blob/main/options/db_options.cc',
 'https://github.com/facebook/rocksdb/blob/main/options/db_options.h',
 'https://github.com/facebook/rocksdb/blob/main/options/offpeak_time_info.cc',
 'https://github.com/facebook/rocksdb/blob/main/options/offpeak_time_info.h',
 'https://github.com/facebook/rocksdb/blob/main/options/options.cc',
 'https://github.com/fac

In [9]:
from langchain.schema.document import Document
import requests
from requests.exceptions import HTTPError
code_strings = []

for i in range(len(code_files_urls)):
    try:
        # Assuming extract_code handles the URL fetching
        content = extract_code(code_files_urls[i])
        doc = Document(page_content=content, metadata={"url": code_files_urls[i], "file_index": i})
        code_strings.append(doc)
    except HTTPError as e:
        if e.response.status_code == 404:
            print(f"Skipping file at {code_files_urls[i]} due to 404 error.")
        else:
            raise  # Re-raise the exception if it's not a 404

# Process the successfully fetched code strings
code_strings[0]

Document(metadata={'url': 'https://github.com/facebook/rocksdb/blob/main/options/cf_options.cc', 'file_index': 0}, page_content='//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.\n//  This source code is licensed under both the GPLv2 (found in the\n//  COPYING file in the root directory) and Apache 2.0 License\n//  (found in the LICENSE.Apache file in the root directory).\n\n#include "options/cf_options.h"\n\n#include <cassert>\n#include <cinttypes>\n#include <limits>\n#include <string>\n\n#include "logging/logging.h"\n#include "options/configurable_helper.h"\n#include "options/db_options.h"\n#include "options/options_helper.h"\n#include "options/options_parser.h"\n#include "port/port.h"\n#include "rocksdb/advanced_cache.h"\n#include "rocksdb/compaction_filter.h"\n#include "rocksdb/concurrent_task_limiter.h"\n#include "rocksdb/configurable.h"\n#include "rocksdb/convenience.h"\n#include "rocksdb/env.h"\n#include "rocksdb/file_system.h"\n#include "rocksdb/merge_operato

In [25]:
len(code_strings)

21

In [23]:
code_strings

[Document(metadata={'url': 'https://github.com/facebook/rocksdb/blob/main/options/cf_options.cc', 'file_index': 0}, page_content='//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.\n//  This source code is licensed under both the GPLv2 (found in the\n//  COPYING file in the root directory) and Apache 2.0 License\n//  (found in the LICENSE.Apache file in the root directory).\n\n#include "options/cf_options.h"\n\n#include <cassert>\n#include <cinttypes>\n#include <limits>\n#include <string>\n\n#include "logging/logging.h"\n#include "options/configurable_helper.h"\n#include "options/db_options.h"\n#include "options/options_helper.h"\n#include "options/options_parser.h"\n#include "port/port.h"\n#include "rocksdb/advanced_cache.h"\n#include "rocksdb/compaction_filter.h"\n#include "rocksdb/concurrent_task_limiter.h"\n#include "rocksdb/configurable.h"\n#include "rocksdb/convenience.h"\n#include "rocksdb/env.h"\n#include "rocksdb/file_system.h"\n#include "rocksdb/merge_operat

In [26]:
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import Language

# Chunk code strings
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.CPP,  # Use CPP for .cc or .h files
    # chunk_size=2000,
    # chunk_overlap=200
)
texts = text_splitter.split_documents(code_strings)


In [28]:
len(texts)

371

In [27]:
texts

[Document(metadata={'url': 'https://github.com/facebook/rocksdb/blob/main/options/cf_options.cc', 'file_index': 0}, page_content='//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.\n//  This source code is licensed under both the GPLv2 (found in the\n//  COPYING file in the root directory) and Apache 2.0 License\n//  (found in the LICENSE.Apache file in the root directory).\n\n#include "options/cf_options.h"\n\n#include <cassert>\n#include <cinttypes>\n#include <limits>\n#include <string>\n\n#include "logging/logging.h"\n#include "options/configurable_helper.h"\n#include "options/db_options.h"\n#include "options/options_helper.h"\n#include "options/options_parser.h"\n#include "port/port.h"\n#include "rocksdb/advanced_cache.h"\n#include "rocksdb/compaction_filter.h"\n#include "rocksdb/concurrent_task_limiter.h"\n#include "rocksdb/configurable.h"\n#include "rocksdb/convenience.h"\n#include "rocksdb/env.h"\n#include "rocksdb/file_system.h"\n#include "rocksdb/merge_operat

In [31]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

loader = DirectoryLoader('rocksdb-wiki', glob="./*.md", show_progress=True, loader_cls=UnstructuredMarkdownLoader)
wiki_documents = loader.load()


  1%|          | 1/166 [02:12<6:05:34, 132.93s/it]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 166/166 [01:51<00:00,  1.49it/s]


In [38]:
wiki_documents

[Document(metadata={'source': 'rocksdb-wiki/Talks.md'}, page_content='2022 RocksDB Meetup at Meta Headquarters\n\n01:35 Kickoff/Introduction\n\n05:20 XDP-Rocks: a Hardware-Accelerated RocksDB by Pliops\n\n25:50 Speedb: Memory Management as a Stepping Stone to Auto-Tuning\n\n1:01:00 Evolution of RocksDB at Nutanix\n\n1:12:05 RocksDB Vision\n\n1:33:55 DBtune : Database Tuning Software as a Service\n\n1:45:26 Hyper-Clock Cache\n\n1:55:10 Time Aware Tiered Storage\n\n2:03:37 Parallel Iterator IO\n\n2:14:15 Range Deletion Performance Improvement\n\nRocksDB meetup - August 2021\n\nHow Rockset built a Clustered Search Index using RocksDB (Rockset)\n\nRolling with RocksDB (Miovision)\n\nRibbon Filter Update (Facebook)\n\nRocksDB on Disaggregated Storage (Facebook)\n\nRocksDB Meetup for October 2020 featuring Facebook, VMware, and Rockset\n\nRocksDB Meetup 2020 at Rockset\n\nStateful Stream Processing with Samza and RocksDB by Abhishek Shivanna and Ray Matharu, both from LinkedIn\n\nCharacteriz

In [33]:
all_sections = []
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3"), ("####", "Header 4")]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
for doc in wiki_documents:
    sections = text_splitter.split_text(doc.page_content)
    all_sections.extend(sections)

In [35]:
len(texts)

371

In [36]:
len(all_sections)

166

In [37]:
final_docs = texts + all_sections
final_docs

[Document(metadata={'url': 'https://github.com/facebook/rocksdb/blob/main/options/cf_options.cc', 'file_index': 0}, page_content='//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.\n//  This source code is licensed under both the GPLv2 (found in the\n//  COPYING file in the root directory) and Apache 2.0 License\n//  (found in the LICENSE.Apache file in the root directory).\n\n#include "options/cf_options.h"\n\n#include <cassert>\n#include <cinttypes>\n#include <limits>\n#include <string>\n\n#include "logging/logging.h"\n#include "options/configurable_helper.h"\n#include "options/db_options.h"\n#include "options/options_helper.h"\n#include "options/options_parser.h"\n#include "port/port.h"\n#include "rocksdb/advanced_cache.h"\n#include "rocksdb/compaction_filter.h"\n#include "rocksdb/concurrent_task_limiter.h"\n#include "rocksdb/configurable.h"\n#include "rocksdb/convenience.h"\n#include "rocksdb/env.h"\n#include "rocksdb/file_system.h"\n#include "rocksdb/merge_operat

In [39]:
len(final_docs)

537

In [44]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
import time

EMBEDDING_QPM = 100  # Queries per minute allowed by your rate limit
EMBEDDING_NUM_BATCH = 5  # Number of batches
MODEL_NAME = "text-embedding-3-small"
SLEEP_TIME = 60  # One minute sleep to respect QPM limits

embeddings = OpenAIEmbeddings(model=MODEL_NAME)

# Calculate the appropriate batch size based on allowed QPM
batch_size = len(final_docs) // EMBEDDING_NUM_BATCH

# Process in smaller batches with pauses to avoid rate limits
db = None
for i in range(0, len(final_docs), batch_size):
    batch = final_docs[i:i + batch_size]
    if db is None:
        db = FAISS.from_documents(batch, embeddings)
    else:
        db.add_documents(batch)  # Corrected to pass only the batch
    time.sleep(SLEEP_TIME)  # Sleep to ensure we do not exceed rate limit


In [42]:
db.save_local("db")

In [14]:

db = FAISS.load_local("db", embeddings=embeddings, allow_dangerous_deserialization=True)
db

<langchain_community.vectorstores.faiss.FAISS at 0x747dfffe3640>

In [15]:
# Init your retriever.
retriever = db.as_retriever()

In [16]:
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("user", "{input}")
    ]
)
print(prompt)

input_variables=['context', 'input'] messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))]


In [17]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0, model="gpt-4o", max_tokens=1000)

In [18]:
from langchain.chains.combine_documents import create_stuff_documents_chain

qna = create_stuff_documents_chain(llm, prompt)
qna

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), config={'run_name': 'format_inputs'})
| ChatPromptTemplate(input_variables=['context', 'input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x747deff28580>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x747dfd524c70>, root_client=<openai.OpenAI object at 0x747dfd5403a0>, root_async_client=<openai.AsyncOpenAI object at 0x747deff28520>, model_name='gpt-4o', temperature=0.0, o

In [19]:
from langchain.chains import create_retrieval_chain

rag = create_retrieval_chain(retriever, qna)
rag

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x747dfffe3640>), config={'run_name': 'retrieve_documents'})
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), config={'run_name': 'format_inputs'})
            | ChatPromptTemplate(input_variables=['context', 'input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise.\n\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(in

In [41]:
response = rag.invoke({"input": "How to do basic tuning in RocksDB"})
response["answer"]

'To perform basic tuning in RocksDB, you can adjust several key options:\n\n1. **Write Buffer Size**: Increase `write_buffer_size` to reduce the frequency of memtable flushes.\n2. **Max Background Jobs**: Set `max_background_jobs` to a higher value to allow more background compactions and flushes.\n3. **Max Open Files**: Increase `max_open_files` to avoid frequent file opening and closing.\n4. **Block Cache Size**: Adjust `block_cache_size` to optimize memory usage for caching data blocks.\n5. **Compaction Settings**: Tune `max_compaction_bytes` and `disable_auto_compactions` to control compaction behavior.\n\nThese adjustments can help improve performance based on your specific workload and hardware configuration.'