In [None]:
import os
import shutil
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever
from langchain.docstore.document import Document

from dotenv import load_dotenv
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.globals import set_llm_cache
from langchain.cache import InMemoryCache

In [None]:
# Load variables from a .env file (if one is found) before any client is built.
load_dotenv()

# Register an in-memory cache as the global LLM cache for this kernel session.
set_llm_cache(InMemoryCache())

# Chat model with library defaults.
# NOTE(review): presumably picks up its API key from the environment populated
# by load_dotenv() above — confirm against your .env.
chat = ChatOpenAI()

In [None]:
# Path of the source PDF, relative to the notebook's working directory.
pdf_filepath = "../datasets/the_importance_of_being_earnest.pdf"

# Loader bound to that PDF.
loader = PyPDFLoader(file_path=pdf_filepath)

# `data` is whatever the loader returns; the next cell iterates over it and
# reads `.page_content` from each item.
data = loader.load()

# Display the loaded documents as the cell output.
data

In [None]:
# Concatenate every loaded document into one string, turning tabs into spaces.
pages_joined = "\n".join(page.page_content.replace("\t", " ") for page in data)

# Strip leading/trailing whitespace from each individual line.
full_data = "\n".join(map(str.strip, pages_joined.split("\n")))

In [None]:
# Print the whole cleaned text.
print(full_data)

In [None]:
# Text whose length is used as the number of leading characters to drop from
# `full_data`. It must match the extracted text character-for-character
# (including the double spaces and curly apostrophes) for the offset to be right.
front_matter = """The Importance of Being Earnest
A Trivial Comedy for Serious People


Oscar Wilde



THE PERSONS IN THE PLAY

John Worthing, J.P.
Algernon Moncrieff
Rev. Canon Chasuble, D.D.
Merriman, Butler
Lane, Manservant
Lady Bracknell
Hon. Gwendolen Fairfax
Cecily Cardew
Miss Prism, Governess


THE SCENES OF THE PLAY

ACT I.  Algernon Moncrieff’s Flat in Half-Moon Street, W.
ACT II.  The Garden at the Manor House, Woolton.
ACT III.  Drawing-Room at the Manor House, Woolton.
TIME: The Present.
FIRST ACT
SCENE
Morning-room in Algernon’s flat in Half-Moon Street.  The room is luxuriously
and artistically furnished.  The sound of a piano is heard in the adjoining
room.
[Lane is arranging afternoon tea on the table, and after the music has ceased,
Algernon enters.]

"""

# Number of characters to skip at the start of the text.
start_idx = len(front_matter)

start_idx

In [None]:
# Text whose length is used as the number of trailing characters to drop from
# `full_data` (three newlines followed by the three closing lines).
back_matter = """


Liked This Book?
For More FREE e-Books visit
Freeditorial.com"""

# Number of characters to cut from the end of the text.
end_idx = len(back_matter)

end_idx

In [None]:
# Drop the first `start_idx` and the last `end_idx` characters of the text.
# NOTE(review): `full_data` is rebound from itself, so running this cell a
# second time trims the text again. Re-run the cell that builds `full_data`
# from `data` before re-running this one.
full_data = full_data[start_idx:-end_idx]

# Print the trimmed text.
print(full_data)

In [None]:
# Parent-level splitter. The separator is a regex matching a run of word
# characters, a period and a newline (presumably the speaker labels, e.g.
# "Algernon." on a line of its own — confirm against the printed text).
# Matched separators are kept in the output rather than discarded.
parent_text_splitter = CharacterTextSplitter(
    separator=r"\w+\.\n",
    is_separator_regex=True,
    keep_separator=True,
    chunk_size=1200,
    chunk_overlap=100,
    length_function=len,
)

# Split the whole play into parent documents and show them.
parent_docs = parent_text_splitter.create_documents([full_data])

parent_docs

In [None]:
# Separator patterns, tried in this order by the recursive splitter.
# First: an "Algernon." label line, one captured line of text, then either
# another capitalised "Name." label line or the end of the string.
algernon_pattern = r'Algernon\.\n(.*?)\n(?:[A-Z][a-z]+\.\n|$)'
# Fallback: any run of word characters followed by a period and a newline.
word_period_newline_pattern = r"\w+\.\n"

# Child-level splitter: smaller chunks, no overlap, separators dropped.
# NOTE(review): with regex separators and keep_separator=False, some LangChain
# versions may re-join small pieces using the raw pattern text — inspect the
# resulting chunks to confirm they look as intended.
child_text_splitter = RecursiveCharacterTextSplitter(
    separators=[algernon_pattern, word_period_newline_pattern],
    is_separator_regex=True,
    keep_separator=False,
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
)

# Child documents for inspection; the retriever cell below is given the
# splitter itself, not this list.
child_docs = child_text_splitter.create_documents([full_data])

In [None]:
# --- Vector store location -------------------------------------------------
db_dirpath = "../output/the_importance_of_being_earnest.db"

# Remove any directory left at that path so the store is rebuilt on every run.
if os.path.isdir(db_dirpath):
    shutil.rmtree(db_dirpath)

# --- Embedding model ---------------------------------------------------------
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=True)

embedding_function = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# --- Stores and retriever ----------------------------------------------------
# Chroma is handed to the retriever as its vector store; the in-memory store
# is handed over as its docstore.
db = Chroma(embedding_function=embedding_function, persist_directory=db_dirpath)
store = InMemoryStore()

par_doc_retriever = ParentDocumentRetriever(
    parent_splitter=parent_text_splitter,
    child_splitter=child_text_splitter,
    docstore=store,
    vectorstore=db,
)

# --- Indexing ------------------------------------------------------------------
# Wrap the trimmed play text in a single Document tagged with its source path
# and let the retriever split and store it.
full_doc = Document(metadata={"source": pdf_filepath}, page_content=full_data)

par_doc_retriever.add_documents([full_doc])

In [None]:
# Sample line of dialogue passed to get_algernon_response() further down.
query = "The work is getting difficult day by day"

In [None]:
def get_algernon_response(query: str) -> str:
    """Reply to ``query`` in Algernon's voice, using retrieved passages as context.

    Steps:
      1. Ask ``par_doc_retriever`` for documents relevant to ``query``.
      2. Keep only documents whose text contains the substring ``"Algernon."``.
      3. Number the kept passages ("Conversation 1:", "Conversation 2:", ...)
         and insert them, together with ``query``, into the prompt template.
      4. Send the prompt to ``chat`` as a single human message.

    Relies on the notebook-level ``par_doc_retriever`` and ``chat`` objects.

    Args:
        query: The user's line of dialogue.

    Returns:
        The ``content`` of the chat model's reply.
    """
    # The template text is sent to the model as-is (including the leading
    # spaces on each line), so it is kept exactly as originally written.
    prompt_template = """Respond like Algernon

    Refer to the below in triple backticks
    ```
    {conversations}
    ```

    Here is my dialogue:
    {query}

    Just return the response as a plain string and nothing else"""

    # `.invoke()` is the Runnable entry point that replaces the deprecated
    # `get_relevant_documents()`.
    matched_docs = par_doc_retriever.invoke(query)

    # Only passages that contain an "Algernon." label are used as examples.
    # If none match, the context section of the prompt is simply left empty.
    res_conversations = [
        doc.page_content for doc in matched_docs if "Algernon." in doc.page_content
    ]
    conversations = "\n\n".join(
        f"Conversation {i}:\n{conv}" for i, conv in enumerate(res_conversations, start=1)
    )

    chat_prompt = ChatPromptTemplate.from_messages(
        [HumanMessagePromptTemplate.from_template(prompt_template)]
    )
    messages = chat_prompt.format_prompt(
        query=query, conversations=conversations
    ).to_messages()

    # `.invoke()` replaces the deprecated direct call `chat(messages=...)`.
    return chat.invoke(messages).content

In [None]:
# Generate a reply for the sample `query` defined above; shown as cell output.
get_algernon_response(query)

In [None]:
# Second example with a different input line; shown as cell output.
get_algernon_response("The sky is beautiful in London")