In [3]:
import openai
import os
import sys

In [4]:
# Constants and setup
# The OpenAI key is read from the environment so no secret is stored in the
# notebook; a missing OPENAI_API_KEY raises KeyError on this line.
openai.api_key = os.environ["OPENAI_API_KEY"]

# PDF that the loader cell reads. The second path is an alternative input,
# kept commented out.
PDF_PATH = "./data/Fluent Python.pdf"
# PDF_PATH = "./data/Python summary.pdf"

## Document loader
- Load the book.

- Add chapter title to metadata.

In [5]:
from langchain.document_loaders import PyPDFLoader

# Read the PDF configured in PDF_PATH. The result (`pages`) is what the
# splitting step consumes next.
loader = PyPDFLoader(file_path=PDF_PATH)
pages = loader.load_and_split()

In [4]:
# Number of documents the loader returned.
len(pages)

65

## Splitting

In [6]:
# Open question: what is a good splitting strategy, and how should the final app do it?
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter parameters; the same two values also name the Chroma directory.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 150

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [7]:
# Split the loaded documents into chunks with the configured splitter.
docs = splitter.split_documents(pages)

In [11]:
# Spot-check a single chunk. The index is arbitrary.
# NOTE(review): raises IndexError when there are fewer than 200 chunks.
docs[-200]

Document(page_content='5The expression self.__data[name]  is where a KeyError  exception may occur. Ideally, it should be handled\nand an AttributeError  raised instead, because that’s what is expected from __getattr__ . The diligent reader\nis invited to code the error handling as an exercise.\n6The source of the data is JSON, and the only collection types in JSON data are dict  and list .Otherwise, fetch the item with the key name  from self.__data , and return the\nresult of calling FrozenJSON.build()  on that.5\nImplementing __dir__  suports the dir()  built-in, which in turns supports auto-\ncompletion in the standard Python console as well as IPython, Jupyter Notebook,\netc. This simple code will enable recursive auto-completion based on the keys in\nself.__data , because __getattr__  builds FrozenJSON  instances on the fly—use‐\nful for interactive exploration of the data.\nThis is an alternate constructor, a common use for the @classmethod  decorator.\nIf obj is a mapping, buil

## Storage

- [ ] Add chapter name to metadata

In [8]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embedding = OpenAIEmbeddings()
# Splitting format: chroma_<chunk_size>_<chunk_overlap>
persist_directory = f"data/chroma_{CHUNK_SIZE}_{CHUNK_OVERLAP}"

# Chroma.from_documents adds to whatever collection already lives in
# persist_directory, so re-running this cell would embed and store every chunk
# a second time (duplicate search hits, repeated embedding cost).
# Reuse the persisted index when one exists and build it only once.
# Delete the directory to force a rebuild (e.g. after changing the PDF).
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )
else:
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        persist_directory=persist_directory,
    )
    vectordb.persist()

In [43]:
# Number of entries stored in the collection, shown as the cell's value rather
# than printed. NOTE(review): `_collection` is a private attribute of the
# Chroma wrapper - confirm it still exists after a langchain upgrade.
vectordb._collection.count()

25815


Plain similarity search over this store has two known issues:
1) The results can contain duplicate chunks.
2) The retrieved chunks can lack their surrounding context.

In [12]:
question = "What about dataclasses?"
# Fetch the 3 chunks closest to the question.
# NOTE: this rebinds `docs`, which held the split chunks until now.
docs = vectordb.similarity_search(query=question, k=3)

## Retrieval

There are different types of search, such as `similarity_search`, `max_marginal_relevance_search`, etc.

In [15]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Point at the directory the Storage step persisted to
# (chroma_<chunk_size>_<chunk_overlap>). A bare 'data/chroma' is a different
# store and would not contain the chunks embedded by this notebook.
persist_directory = f"data/chroma_{CHUNK_SIZE}_{CHUNK_OVERLAP}"

# Load embeddings from database
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

In [19]:
# Using metadata
# TODO: pass filter=... to restrict hits by metadata; no filter is applied yet.
docs = vectordb.similarity_search(query=question, k=5)

In [45]:
# `docs` holds the k=5 hits from the search just above, so only indices 0-4
# exist; index 1000 only worked against the earlier list of split chunks.
# Inspect the first hit to see which metadata keys are available for filtering.
docs[0].metadata

{'source': './data/Fluent Python.pdf', 'page': 60}

## Output

In [24]:
from langchain.chat_models import ChatOpenAI

# Chat model shared by the QA chains below.
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(
    temperature=0,
    model_name=llm_name,
)

In [46]:
from langchain.chains import RetrievalQA

# Baseline chain: default prompt, retriever taken from the vector store.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())


# result = qa_chain({"query": question})
# result["result"]


'Yes, the book does discuss testing with Pytest.'

In [47]:
from langchain.prompts import PromptTemplate

# Build prompt
# The template text contains two placeholders, {context} and {question}.
# The wording is sent to the model as-is, so edit it deliberately.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Check in if the user understood the answer if appropriate. 
{context}
Question: {question}
Helpful Answer:"""
# Wrap the raw text in a PromptTemplate so it can be passed to the QA chain.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [48]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [56]:
question = "Why are threads an inefficient model for concurrency?"

# The chain is called with a dict keyed by "query"; the answer text is read
# from the "result" key of what it returns.
result = qa_chain(dict(query=question))
result["result"]


'Threads are an inefficient model for concurrency because they make code hard to reason about and lack constraints, which can lead to difficulty in managing large-scale concurrency.'