In [16]:
%pip install --upgrade --quiet langchain langchain-community langchain-text-splitters langchain-chroma>=0.1.2 langchain-openai pypdf chromadb==0.4.14

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import glob

DATA_PATH = '/Users/riju/Downloads/git_repo/local_rag/data/books/'

def load_documents():
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 500,
        length_function = len,
        add_start_index=True
    )
    loader = PyPDFDirectoryLoader(DATA_PATH, glob="*.pdf")
    chunks = loader.load_and_split(text_splitter=text_splitter)
    return chunks

chunks = load_documents()

In [3]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
import os
import shutil

CHROMA_PATH = "chroma"
API_KEY = '<API_KEY>'

def save_to_chroma(chunks: list[Document]):
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    embeddings = OpenAIEmbeddings(api_key= API_KEY)

    db = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=CHROMA_PATH,  # Where to save data locally, remove if not neccesary
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}")

save_to_chroma(chunks)

Saved 35 chunks to chroma


  db.persist()


In [8]:
import argparse
# from dataclasses import dataclass
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

def query_data(query_text):
    # Create CLI.
    # parser = argparse.ArgumentParser()
    # parser.add_argument("query_text", type=str, help="The query text.")
    # args = parser.parse_args()
    # query_text = args.query_text

    # Prepare the DB.
    embedding = OpenAIEmbeddings(api_key=API_KEY)
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding)

    # Search the DB.
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    if len(results) == 0 or results[0][1] < 0.7:
        print(f"Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    # print(prompt)

    model = ChatOpenAI(model="gpt-4o-mini",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=API_KEY)
    response_text = model.predict(prompt)

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

In [9]:
query_data("who is the writer of the book")

Response: The writer of the book is Derek Strange.
Sources: ['/Users/riju/Downloads/git_repo/local_rag/data/books/penguin_book.pdf', '/Users/riju/Downloads/git_repo/local_rag/data/books/penguin_book.pdf', '/Users/riju/Downloads/git_repo/local_rag/data/books/penguin_book.pdf']
