In [1]:
import os
from environs import Env

# Load variables from a local ".env" file (if it exists) so the API key
# never has to be hardcoded in the notebook.
env = Env()
env.read_env(".env")

# Fail fast with an actionable message instead of handing None to the
# OpenAI clients below, where a missing key is harder to diagnose.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

In [8]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFium2Loader

In [4]:
# Source PDF, relative to the notebook's working directory.
PDF_PATH = "docs/scope_and_sequence.pdf"

# Read the PDF into the documents consumed by the splitting step.
loader = PyPDFium2Loader(PDF_PATH)
data = loader.load()

In [9]:
# Break the loaded documents into chunks (chunk size 1000, no overlap).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

# Both OpenAI clients are configured with the same API key.
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

# Build the Chroma vector store from the chunks using the embeddings client.
docsearch = Chroma.from_documents(documents=texts, embedding=embeddings)

In [10]:
# Expose the vector store as a retriever and combine it with the LLM
# in a RetrievalQA chain built with the "stuff" chain type.
retriever = docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [11]:
# Smoke-test question; the call is the cell's last expression so its
# result is displayed as the cell output.
question = "What is this document about?"
qa.invoke(question)

{'query': 'What is this document about?',
 'result': ' This document appears to be a set of educational standards for a fourth grade language arts and reading curriculum, focusing on vocabulary, context clues, grammar, spelling, and writing. It also includes standards for teaching positive character traits and citizenship.'}