In [5]:
pip install langchain langchain-community unstructured unstructured[pdf] openai chromadb tiktoken langchain-cli pytest langchain-openai

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain
  Using cached langchain-0.2.1-py3-none-any.whl (973 kB)
Collecting langchain-community
  Using cached langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
Collecting numpy<2,>=1
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (14.2 MB)
Collecting pydantic<3,>=1
  Using cached pydantic-2.7.3-py3-none-any.whl (409 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0
  Using cached langchain_text_splitters-0.2.0-py3-none-any.whl (23 kB)
Collecting PyYAML>=5.3
  Using cached PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (677 kB)
Collecting SQLAlchemy<3,>=1.4
  Using cached SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (3.1 MB)
Collecting aiohttp<4.0.0,>=3.8.3
  Using cached aiohttp-3.9.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.2 MB)
Collecting tenacity<9.0.0,>=8.1.0
 

In [4]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key:")

In [None]:
!python 

In [32]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate
import os
import shutil

CHROMA_PATH = "chroma"
DATA_PATH = "data/books"
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


def main():
    generate_data_store()

    query_text = "Why humans run the world?"
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    print(query_text)
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    print(results[0][1])
    if len(results) == 0 or results[0][1] < 0.7:
        print("Please ask a question related to the PDF file!")
        return

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)


def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)


def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf")
    documents = loader.load()
    return documents


def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    db = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


if __name__ == "__main__":
    main()


Split 1 documents into 298 chunks.
18 A Permanent Revolution 19 And They Lived Happily Ever After 20 The End of Homo Sapiens

Afterword: The Animal that Became a God

Notes Acknowledgements Image credits

Timeline of History

Years

Before

the

Present

13.5

Matter and energy appear. Beginning of physics. Atoms and molecules
{'source': 'data/books/index.pdf', 'start_index': 2057}


OperationalError: attempt to write a readonly database

In [33]:
!pip freeze

aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.4.0
asgiref==3.8.1
asttokens==2.4.1
async-timeout==4.0.3
attrs==23.2.0
backoff==2.2.1
bcrypt==4.1.3
beautifulsoup4==4.12.3
build==1.2.1
cachetools==5.3.3
certifi==2024.6.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.5.0
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
comm==0.2.2
contourpy==1.2.1
cryptography==42.0.7
cycler==0.12.1
dataclasses-json==0.6.6
debugpy==1.8.1
decorator==5.1.1
deepdiff==7.0.1
Deprecated==1.2.14
distro==1.9.0
effdet==0.4.1
emoji==2.12.1
exceptiongroup==1.2.1
executing==2.0.1
fastapi==0.110.3
filelock==3.14.0
filetype==1.2.0
flatbuffers==24.3.25
fonttools==4.53.0
frozenlist==1.4.1
fsspec==2024.6.0
gitdb==4.0.11
GitPython==3.1.41
google-api-core==2.19.0
google-auth==2.29.0
google-cloud-vision==3.7.2
googleapis-common-protos==1.63.1
greenlet==3.0.3
grpcio==1.64.1
grpcio-status==1.62.2
h11==0.14.0
httpcore==1.0.5
httptools==0.6.

In [22]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.evaluation import load_evaluator


evaluator = load_evaluator("pairwise_embedding_distance")

words = ("apple", "apple")
x = evaluator.evaluate_string_pairs(prediction=words[0], prediction_b=words[1])
print(f"Comparing ({words[0]}, {words[1]}): {x}")

words = ("apple", "iphone")
x = evaluator.evaluate_string_pairs(prediction=words[0], prediction_b=words[1])
print(f"Comparing ({words[0]}, {words[1]}): {x}")

words = ("apple", "banana")
x = evaluator.evaluate_string_pairs(prediction=words[0], prediction_b=words[1])
print(f"Comparing ({words[0]}, {words[1]}): {x}")

Comparing (apple, apple): {'score': -1.1102230246251565e-15}
Comparing (apple, iphone): {'score': 0.09710853291781563}
Comparing (apple, banana): {'score': 0.09725941975023544}


In [28]:
query_text = "What is money?"
embedding_function = OpenAIEmbeddings()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
results = db.similarity_search_with_relevance_scores(query_text, k=3)
if len(results) == 0 or results[0][1] < 0.7:
    print("Please ask a question related to the PDF file!")
print(results)

context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)
print(prompt)

model = ChatOpenAI()
response_text = model.predict(prompt)

sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Please ask a question related to the PDF file!
[]
Human: 
You are an intelligent assistant. You have been provided with the following context extracted from a PDF document:



Based on this context, please provide a simple answer to the following question:

What is money?

Response: Money is a medium of exchange used to facilitate transactions and represent value.
Sources: []


In [31]:
import os
import shutil
import sys

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate

CHROMA_PATH = "chroma"
DATA_PATH = "data/books"
PROMPT_TEMPLATE = """
You are an intelligent assistant. You have been provided with the following context extracted from a PDF document:

{context}

Based on this context, please provide an answer to the following question:

{question}
"""


def main():
    pdf_file = "index.pdf"
    documents = load_documents(pdf_file)
    chunks = split_text(documents)
    save_to_chroma(chunks)

    query_text = "Why humans run the world?"
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    if len(results) == 0 or results[0][1] < 0.7:
        print("Please ask a question related to the PDF file!")
        return


    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)


def load_documents(file):
    loader = DirectoryLoader(DATA_PATH, glob=file)
    documents = loader.load()
    return documents


def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    Chroma.from_documents(chunks, OpenAIEmbeddings())


if __name__ == "__main__":
    main()


Split 1 documents into 220 chunks.
16,000 Sapiens settle America. Extinction of American megafauna.

13,000

Extinction of Homo floresiensis. Homo sapiens the only surviving human

species.

12,000

The Agricultural Revolution. Domestication of plants and animals.

Permanent settlements.

5,000

First kingdoms, script and money. Polytheistic religions.

4,250

First empire – the Akkadian Empire of Sargon.
{'source': 'data/books/index.pdf', 'start_index': 3085}
Please ask a question related to the PDF file!
