In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from a .env file (if one is found) into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the cells below comes from — confirm.
load_dotenv()

### Understanding Stores in LangChain

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the path with os.path.join so the notebook runs on every OS; the
# previous backslash-separated raw string only resolved on Windows.
file_path = os.path.join("data", "montreal.pdf")

loader = PyPDFLoader(file_path=file_path)

# PyPDFLoader produces one Document per PDF page. With text_splitter=None,
# load_and_split() does not skip splitting: it falls back to a default
# RecursiveCharacterTextSplitter, so a page stays whole only while it fits in
# that splitter's default chunk size.
# NOTE(review): confirm the default chunk size for the installed LangChain version.
documents = loader.load_and_split(text_splitter=None)
documents

### Using the InMemoryStore

In [None]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from database import COLLECTION_NAME

# Metadata key under which a child chunk records the id of its parent document.
id_key = "doc_id"

# Vector-store side: a Chroma collection embedded with OpenAI embeddings.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name=COLLECTION_NAME,
)

# Document-store side: parents are kept in an InMemoryStore.
store = InMemoryStore()

# Wire the two stores together through the shared id key.
retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=store,
    vectorstore=vectorstore,
)

retriever

### Using the PostgresByteStore

In [None]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from utils.store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer

# Metadata key under which a child chunk records the id of its parent document.
id_key = "doc_id"

# Vector-store side: PGVector reached through CONNECTION_STRING.
embeddings = OpenAIEmbeddings()
vectorstore = PGVector(
    connection=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings,
    use_jsonb=True,
)

# Document-store side: the project's PostgresByteStore helper (utils.store),
# given the same connection string and collection name.
store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)

# `retriever` is rebound here; the remaining cells all go through this instance.
retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=store,
    vectorstore=vectorstore,
)

retriever

In [None]:
import uuid

# One random UUID4 string per entry of `documents`, in the same order.
doc_ids = [str(uuid.uuid4()) for _ in range(len(documents))]
doc_ids

In [None]:
# Child chunks use chunk_size=400; every other splitter setting keeps its default.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

all_sub_docs = []
for parent_id, parent_doc in zip(doc_ids, documents):
    children = child_text_splitter.split_documents([parent_doc])
    # Stamp every child with its parent's id so a hit can be traced back to the parent.
    for child in children:
        child.metadata[id_key] = parent_id
    all_sub_docs += children

all_sub_docs

In [None]:
# Put the child chunks into the vector store ...
retriever.vectorstore.add_documents(documents=all_sub_docs)
# ... and store each parent document in the docstore under its generated id.
retriever.docstore.mset([(doc_id, doc) for doc_id, doc in zip(doc_ids, documents)])

### Testing the retriever

In [None]:
# Query the vector store directly: this returns the entries held in the index itself.
retriever.vectorstore.similarity_search(
    query="What are some unique seasonal events in Montreal that a visitor should not miss?"
)

In [None]:
# Ask the same question through the MultiVectorRetriever instead of the raw vector store.
retriever.invoke(
    input="What are some unique seasonal events in Montreal that a visitor should not miss?"
)

### Creating Summaries for Each Parent Chunk

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Summarisation prompt; {element} is replaced by the text to summarise.
prompt_text = (
    "You are an assistant tasked with summarizing text. "
    "Directly summarize the following text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Chat model used for the summaries (gpt-4o, temperature 0).
model = ChatOpenAI(model="gpt-4o", temperature=0)

# raw text -> {"element": text} -> prompt -> chat model -> plain string
summarize_chain = {"element": lambda text: text} | prompt | model | StrOutputParser()

In [None]:
# Summarise the text of every parent document, with max_concurrency capped at 5.
parent_chunk = [parent.page_content for parent in documents]
text_summaries = summarize_chain.batch(parent_chunk, config={"max_concurrency": 5})

In [None]:
# Display the summaries returned by summarize_chain.batch for the parent documents.
text_summaries

In [None]:
from langchain.schema.document import Document

# Wrap every summary in a Document that points back to its parent document.
summary_docs = []
for i, (summary, doc_id, parent) in enumerate(zip(text_summaries, doc_ids, documents)):
    summary_docs.append(
        Document(
            page_content=str(summary),
            metadata={
                # Prefer the page recorded on the parent document: the loop
                # index equals the page number only while `documents` holds
                # exactly one entry per page. Fall back to the index otherwise.
                "page": parent.metadata.get("page", i),
                # Use id_key rather than a second "doc_id" literal so the
                # summaries always carry the key the retriever is configured with.
                id_key: doc_id,
            },
        )
    )


In [None]:
# Display the summary Documents (summary text plus "page" and "doc_id" metadata).
summary_docs

In [None]:
# Add the summaries to the vector store next to whatever is already indexed.
retriever.vectorstore.add_documents(documents=summary_docs)
# Write the parent documents to the docstore under their ids.
retriever.docstore.mset([(doc_id, doc) for doc_id, doc in zip(doc_ids, documents)])

### Generating Hypothetical Questions for Each Parent Chunk

In [None]:
# Function schema offered to the chat model: a single function whose only
# (required) argument, "questions", is an array of strings.
functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=dict(
            type="object",
            properties=dict(
                questions=dict(type="array", items=dict(type="string")),
            ),
            required=["questions"],
        ),
    )
]

In [None]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Prompt asking for 5 hypothetical questions (the count could be adjusted);
# {doc} receives the text of one document.
hypothetical_questions_prompt = ChatPromptTemplate.from_template(
        """Generate a list of exactly 5 hypothetical questions that the below document could be used to answer:\n\n{doc}
        seperate each question with a comma (,)
        """
)

# Chat model bound to the `functions` schema, with the function call pinned to
# "hypothetical_questions".
function_calling_model = ChatOpenAI(model="gpt-4o", max_retries=0).bind(
    function_call={"name": "hypothetical_questions"},
    functions=functions,
)

# document -> {"doc": page_content} -> prompt -> function call -> value of "questions"
question_chain = (
    {"doc": lambda document: document.page_content}
    | hypothetical_questions_prompt
    | function_calling_model
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [None]:
# Run the question chain over every parent document, with max_concurrency capped at 5.
hypothetical_questions = question_chain.batch(documents, config={"max_concurrency": 5})

In [None]:
# Display the output of question_chain.batch for the parent documents.
hypothetical_questions

In [None]:
from langchain.schema.document import Document

# One Document per generated question: the question text is the page_content
# and the metadata points back to the parent document. The metadata key is
# id_key (not a repeated "doc_id" literal) so these entries always carry the
# key the retriever is configured with.
hypothetical_docs = [
    Document(page_content=question, metadata={id_key: doc_id})
    for question_list, doc_id in zip(hypothetical_questions, doc_ids)
    for question in question_list
]

In [None]:
# Display the question Documents (one per generated question, each tagged with its parent's id).
hypothetical_docs

In [None]:
# Add the hypothetical questions to the vector store.
retriever.vectorstore.add_documents(documents=hypothetical_docs)
# Write the parent documents to the docstore under their ids.
retriever.docstore.mset([(doc_id, doc) for doc_id, doc in zip(doc_ids, documents)])

In [None]:
# Query the vector store directly: this returns the entries held in the index itself.
retriever.vectorstore.similarity_search(
    query="What dining options are available in Montreal for those interested in Middle Eastern cuisine?"
)

In [None]:
# Ask the same question through the MultiVectorRetriever instead of the raw vector store.
retriever.invoke(
    input="What dining options are available in Montreal for those interested in Middle Eastern cuisine?"
)

### Creating an LCEL Chain and Testing the Retriever

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

def format_docs(docs):
    """Join retrieved documents into one plain-text context string.

    Without this step the prompt receives the repr of the list of retrieved
    objects (class names, metadata, escaped newlines) instead of readable text.

    Args:
        docs: Iterable of retrieved items. Objects exposing ``page_content``
            contribute that text; anything else contributes ``str(item)``.

    Returns:
        The individual texts separated by blank lines.
    """
    return "\n\n".join(getattr(doc, "page_content", str(doc)) for doc in docs)


# Prompt template: the answer must rely only on the retrieved context.
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4o")

# RAG pipeline: the question goes to the retriever (whose results are flattened
# to text by format_docs) and, unchanged, into the {question} slot.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
# End-to-end check of the RAG chain with the Middle Eastern dining question.
chain.invoke(
    input="What dining options are available in Montreal for those interested in Middle Eastern cuisine?"
)

In [None]:
# End-to-end check of the RAG chain with a narrower food question.
chain.invoke(
    input="Where can I find the best smoked meat sandwiches in Montreal?"
)

In [None]:
# End-to-end check of the RAG chain with a broad, open-ended food question.
chain.invoke(
    input="Where can I find the best food in Montreal?"
)