In [27]:
import os
import sys
import logging

from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain, ConversationChain
from langchain.retrievers import BM25Retriever, EnsembleRetriever, MultiQueryRetriever, ContextualCompressionRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter, SpacyTextSplitter
from langchain.vectorstores import FAISS
from langchain.agents import Tool, AgentType, initialize_agent
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

# --- Runtime configuration -------------------------------------------------
log = logging.getLogger("ChatBot")
logging.basicConfig(level=logging.WARNING)

# Credentials must never live in the notebook: the OpenAI clients used below
# read OPENAI_API_KEY from the process environment, so export it before
# starting the kernel (or load it from a local, untracked .env file).
# NOTE(review): a literal key used to be assigned here; treat that key as
# compromised and revoke/rotate it.
if not os.environ.get("OPENAI_API_KEY"):
    log.warning(
        "OPENAI_API_KEY is not set; export it in the environment before "
        "running the cells that talk to OpenAI."
    )

# Set explicitly so the HuggingFace tokenizers library does not have to guess
# its parallelism setting.
os.environ["TOKENIZERS_PARALLELISM"] = "TRUE"

# PDF that the chatbot indexes; resolved relative to the kernel's working directory.
file_path = "chapter6.pdf"


# One-time setup (run once in a shell, not on every execution):
#   python -m spacy download en_core_web_sm


def log_messages(string):
    """Write one progress line to stdout, prefixed with the chatbot emoji tag.

    Args:
        string: The message to show; it is interpolated into the line with
            default formatting.
    """
    line = " 🤖💬️  {}".format(string)
    print(line)

In [29]:
class PDFChatbot:
    """Index a single PDF and retrieve the chunks most relevant to a query.

    Construction loads the PDF, concatenates the text of all pages into one
    ``Document``, splits it into chunks with a spaCy-based splitter, creates
    a ``ChatOpenAI`` model, and builds an ensemble of three retrievers
    (multi-query, contextual-compression over FAISS, and BM25).

    Attributes set by ``__init__``:
        document: The whole PDF text as one ``Document``.
        text_splitter: The ``SpacyTextSplitter`` used to chunk ``document``.
        documents: The chunks produced by ``text_splitter``.
        chat: The ``ChatOpenAI`` instance shared by the LLM-backed retrievers.

    Attributes set by ``retreivers_load``:
        bm25_retriever, embedding, vectorstore, faiss_retriever,
        multiquery_retriever, compressor, compression_retriever,
        ensemble_retriever.
    """

    def __init__(self, file_path: str) -> None:
        """Load ``file_path``, split it into chunks and build the retrievers.

        Args:
            file_path: Path of the PDF to index.

        Raises:
            FileNotFoundError: If loading fails and ``file_path`` does not exist.
            ValueError: If the PDF yields no chunks to index.
            Exception: Any other loader error is re-raised unchanged after
                the "unable to open" message has been logged.
        """
        log_messages(f'FileLoad|{file_path}|Attempting')
        try:
            pages = PyPDFLoader(file_path=file_path).load()
        except Exception as exc:
            # Previously a bare ``except`` logged and fell through, which then
            # crashed on an unbound variable. Fail here with a clear error.
            if not os.path.exists(file_path):
                log_messages("File not found, please try with another file")
                raise FileNotFoundError(f"PDF not found: {file_path}") from exc
            log_messages('Unable to open file, please check the integrity of the file')
            raise
        log_messages(f'FileLoad|{file_path}|Successful')

        log_messages("TextSplitter|Instantiation|Start")
        # Pages are concatenated with no separator, so the splitter sees one
        # continuous text and chunks may span page boundaries.
        full_text = ''.join(page.page_content for page in pages)
        self.document = Document(page_content=full_text, metadata={"source": file_path})
        self.text_splitter = SpacyTextSplitter(pipeline='en_core_web_sm', chunk_overlap=256)
        log_messages("TextSplitter|Instantiation|End")

        log_messages("TextSplitter|split_documents|Start")
        # Stored on the instance: retreivers_load() needs the chunks, and a
        # local variable here is not visible from that method.
        self.documents = self.text_splitter.split_documents(documents=[self.document])
        log_messages("TextSplitter|split_documents|End")
        if not self.documents:
            raise ValueError(f"No text chunks could be extracted from {file_path}")

        log_messages("ChatOpenAI|Model|loading|")
        # Stored on the instance for the same reason as ``documents``.
        self.chat = ChatOpenAI()
        log_messages("ChatOpenAI|model|loaded|")

        self.retreivers_load()

    def retreivers_load(self):
        """Build every retriever from ``self.documents`` and ``self.chat``.

        The method name keeps its original spelling so existing callers keep
        working. Both ``self.documents`` and ``self.chat`` must be set before
        this is called; ``__init__`` does so.
        """
        # Keyword-based retriever over the chunks.
        self.bm25_retriever = BM25Retriever.from_documents(self.documents)
        self.bm25_retriever.k = 5

        # Embedding-based vector store over the same chunks.
        self.embedding = OpenAIEmbeddings()
        self.vectorstore = FAISS.from_documents(documents=self.documents, embedding=self.embedding)
        self.faiss_retriever = self.vectorstore.as_retriever(search_kwargs={"k": 3})

        # LLM-backed retrievers layered on the vector store.
        self.multiquery_retriever = MultiQueryRetriever.from_llm(
            retriever=self.vectorstore.as_retriever(), llm=self.chat)
        self.compressor = LLMChainExtractor.from_llm(self.chat)
        self.compression_retriever = ContextualCompressionRetriever(
            base_compressor=self.compressor,
            base_retriever=self.faiss_retriever)

        # NOTE(review): the weights sum to 0.9, not 1.0. They are kept as
        # originally written; confirm this is intended.
        self.ensemble_retriever = EnsembleRetriever(
            retrievers=[self.multiquery_retriever, self.compression_retriever, self.bm25_retriever],
            weights=[0.2, 0.6, 0.1])

    def get_top_document(self, query: str):
        """Return whatever the ensemble retriever yields for ``query``.

        Args:
            query: The user's question.

        Returns:
            The result of ``ensemble_retriever.get_relevant_documents``;
            callers in this notebook treat it as a list of documents and
            use the first entry.
        """
        return self.ensemble_retriever.get_relevant_documents(query=query)


# Build the chatbot for the configured PDF. Construction is the expensive
# step of the notebook: it loads and splits the file and creates the
# retrievers, so the query cells below reuse this single instance.
pdf_chatbot = PDFChatbot(file_path=file_path)

 🤖💬️  FileLoad|chapter6.pdf|Attempting
 🤖💬️  FileLoad|chapter6.pdf|Successful
 🤖💬️  TextSplitter|Instantiation|Start
 🤖💬️  TextSplitter|Instantiation|End
 🤖💬️  TextSplitter|split_documents|Start
 🤖💬️  TextSplitter|split_documents|End
 🤖💬️  ChatOpenAI|Model|loading|
 🤖💬️  ChatOpenAI|model|loaded|


In [30]:
# Raw prompt text. Its indentation and trailing whitespace are part of the
# prompt sent to the model, so the string is kept exactly as written.
PROMPT_TEXT = """Answer the following question based on the context only and nothing else. 
                    if the user is asking for instructions, return the answer as bullets
                    context: {context}
                    question: {user_input}
                    answer: """

# ``prompt_template`` now only ever names the PromptTemplate object; it was
# previously bound to the raw string first and then rebound.
prompt_template = PromptTemplate.from_template(template=PROMPT_TEXT)

# Build the chat model here rather than relying on a module-level ``chat``
# variable, which no cell of this notebook defines (it only existed as
# leftover kernel state, so Restart & Run All failed on this line).
llm_chain = LLMChain(llm=ChatOpenAI(), prompt=prompt_template)

In [25]:
# Question to ask about the indexed PDF; edit and re-run this cell to query again.
user_input = "are there zones in the rest area with lesser time?"

contexts = pdf_chatbot.get_top_document(user_input)
if contexts:
    # Only the first document returned by the retriever is used as context.
    context = contexts[0]
    # Pass the chunk's text rather than the Document object: formatting the
    # object into the prompt inserts its repr (page_content='...' metadata=...)
    # with escaped newlines instead of the plain passage.
    output = llm_chain(inputs={'context': context.page_content, 'user_input': user_input})
    # The full Document (text and metadata) is still printed for traceability.
    print(output['text'], f"\n\nContext: {context}")
else:
    print("Cannot answer the question given that there were not relevant documents returned")

- Yes, the department may designate zones within a Rest Area with shorter parking time limits for maximum efficiency and safety. 

Context: page_content='Rest Area parking is permitted by law RCW 47.38.020.\nRest Area parking is limited to eight hours within a twenty-four-hour period.\nThe department may designate zones within a Rest Area with shorter parking time limits for the purposes of maximum efficiency and safety.\nThe department shall post the appropriate signage consistent with RCW 46.55.070(1) at all Rest Areas regarding the parking time limits in this section.' metadata={'source': 'chapter6.pdf'}
