In [13]:
import sys
import os
import getpass
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import TextLoader
from langchain.embeddings import GooglePalmEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import GooglePalm
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate

In [14]:
api_key = getpass.getpass("Enter your OpenAI API key: ")

In [15]:
# create an empty list to store the documents
documents = []

# loop through all files in the "docs" directory
for file in os.listdir("docs"):
    # if the file is a PDF, load it using the PyPDFLoader and add the resulting documents to the list
    if file.endswith(".pdf"):
        pdf_path = "./docs/" + file
        loader = PyPDFLoader(pdf_path)
        documents.extend(loader.load())
    # if the file is a Word document, load it using the Docx2txtLoader and add the resulting documents to the list
    elif file.endswith('.docx') or file.endswith('.doc'):
        doc_path = "./docs/" + file
        loader = Docx2txtLoader(doc_path)
        documents.extend(loader.load())
    # if the file is a plain text file, load it using the TextLoader and add the resulting documents to the list
    elif file.endswith('.txt'):
        text_path = "./docs/" + file
        loader = TextLoader(text_path)
        documents.extend(loader.load())

In [16]:
# create a text splitter object with a chunk size of 1000 and overlap of 10
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)

# split the documents into chunks using the text splitter
documents = text_splitter.split_documents(documents)

In [None]:
# create a vector database using the Chroma vector store, with documents as input
# use the GooglePalmEmbeddings for embedding, and persist the database to the "./data" directory
vectordb = Chroma.from_documents(documents, embedding=GooglePalmEmbeddings(google_api_key=api_key), persist_directory="./data")

# persist the vector database
vectordb.persist()

In [17]:
# create a ConversationalRetrievalChain object for PDF question answering
pdf_qa = ConversationalRetrievalChain.from_llm(
    GooglePalm(google_api_key=api_key),  # use GooglePalm for language modeling
    vectordb.as_retriever(search_kwargs={'k': 6}),  # use the Chroma vector store for document retrieval
    return_source_documents=True,  # return the source documents along with the answers
    verbose=False  # do not print verbose output
)

In [18]:
# create an empty list to store the chat history
chat_history = []

# print a welcome message to the user
print("---------------------------------------------------------------------------------")
print('Welcome to the DocBot. You are now ready to start interacting with your documents')
print('---------------------------------------------------------------------------------')

---------------------------------------------------------------------------------
Welcome to the DocBot. You are now ready to start interacting with your documents
---------------------------------------------------------------------------------


In [19]:
# create an interactive loop to prompt the user for questions
while True:
    # prompt the user for a question
    query = input("Prompt: ")
    
    # check if the user wants to exit the loop
    if query == "exit" or query == "quit" or query == "q" or query == "f":
        print('Exiting')
        sys.exit()
    
    # check if the user entered an empty query
    if query == '':
        continue
    
    # use the ConversationalRetrievalChain to find the answer to the user's question
    result = pdf_qa({"question": query, "chat_history": chat_history})
    
    # print the answer to the user
    print("Answer: " + result["answer"])
    
    # add the user's question and the resulting answer to the chat history
    chat_history.append((query, result["answer"]))

Answer: Large Language Models as Optimizers (OPRO) is a framework that uses large language models (LLMs) to optimize for different objectives. OPRO can be used for tasks such as prompt optimization, where the goal is to find a prompt that optimizes the task accuracy, and code generation, where the goal is to generate code that solves a given problem.
Answer: Varun Nair, Elliot Schumacher, Geoffrey Tso, and Anitha Kannan
Answer: GSM8K GSM8K (Grade School Math 8K) is a dataset of 7,473 grade-school math word problems
from the Math 8 curriculum in the United States. The dataset is split into a training set of 6,223
examples and a test set of 1,250 examples.

Exiting


SystemExit: 