In [8]:
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

True

### Understanding Stores in LangChain

In [14]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

file_path = r"data\montreal.pdf"

loader = PyPDFLoader(file_path=file_path)

# by default, we will split by pages with no text_splitter
documents = loader.load_and_split(text_splitter=None)
documents

[Document(page_content='Things to Do in Toronto \nPage 1: Introduction \nToronto, the capital of Ontario, is the largest city in Canada and a dynamic, cosmopolitan \nhub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor. Very Nice!! \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t \nmiss the glass ﬂoor and the revolving restaurant at the top. \n\uf0b7 Royal Ontario Museum (ROM):  Canada’s largest museum of world cultures and \nnatural history is a must-visit. \n\uf0b7 Toronto Islands:  A group of small islands located just o Ư the city’s shore, o Ưering \nbeautiful beaches, picnic spots, and bike rentals.', metadata={'source': 'data\\toronto.pdf', 'page': 0}),
 Document(page_content='Page 2: Cultural Experiences \nToronto is a melting pot of cultures, and this is reﬂected in its neighborhoods and festivals. \nNeighborhoods:  \n

### Unsing the PostgresByteStore

In [15]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from utils.store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer
from langchain.indexes import SQLRecordManager, index

embeddings = OpenAIEmbeddings()
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
    use_jsonb=True,
)

store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=vectorstore, 
    docstore=store, 
    id_key=id_key,
)

# define record manager
namespace = f"pgvector/{COLLECTION_NAME}"
record_manager = SQLRecordManager(
    namespace, db_url=CONNECTION_STRING
)
record_manager.create_schema()

retriever

MultiVectorRetriever(vectorstore=<langchain_postgres.vectorstores.PGVector object at 0x00000224D86E5510>, docstore=<utils.store.PostgresByteStore object at 0x00000224ACAE58D0>)

In [16]:
import uuid

# Add a unique doc_id to each document's metadata
for doc in documents:
    doc.metadata["doc_id"] = str(uuid.uuid4())

In [None]:
documents

In [17]:
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Assume documents is a list of Document objects with 'doc_id' in their metadata
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

all_sub_docs = []
for doc in documents:
    doc_id = doc.metadata.get("doc_id")  # Retrieve the doc_id from the document's metadata
    sub_docs = child_text_splitter.split_documents([doc])
    for sub_doc in sub_docs:
        sub_doc.metadata["doc_id"] = doc_id  # Assign the same doc_id to each sub-document
    all_sub_docs.extend(sub_docs)

# Display the resulting sub-documents
all_sub_docs


[Document(page_content='Things to Do in Toronto \nPage 1: Introduction \nToronto, the capital of Ontario, is the largest city in Canada and a dynamic, cosmopolitan \nhub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor. Very Nice!! \nKey Attractions:', metadata={'source': 'data\\toronto.pdf', 'page': 0, 'doc_id': 'a59b179b-8fa9-49b2-97d7-a40f3cf06259'}),
 Document(page_content='hub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor. Very Nice!! \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t \nmiss the glass ﬂoor and the revolving restaurant at the top.', metadata={'source': 'data\\toronto.pdf', 'page': 0, 'doc_id': 'a59b179b-8fa9-49b2-97d7-a40f3cf06259'}),
 Document(page_content='Key Attractions:  \n\uf0b7 CN Tower:  This iconic 

In [18]:
idx = index(all_sub_docs, record_manager, vectorstore, cleanup="incremental",
                                          source_id_key="source")
# Generate the list of (doc_id, document) tuples from the documents
doc_id_document_tuples = [(doc.metadata["doc_id"], doc) for doc in documents]

# Pass the list of tuples to retriever.docstore.mset
retriever.docstore.mset(doc_id_document_tuples)

In [7]:
idx

{'num_added': 13, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

### Creating Summaries for Each Parent Chunk

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

prompt_text = """You are an assistant tasked with summarizing text. \
Directly summarize the following text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Initialize the Language Model (LLM)
model = ChatOpenAI(temperature=0, model="gpt-4o")

# Define the summary chain
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [None]:
parent_chunk = [i.page_content for i in documents]
text_summaries = summarize_chain.batch(parent_chunk, {"max_concurrency": 5})

In [None]:
text_summaries

In [None]:
from langchain.schema.document import Document
import copy

# Extract metadata from documents and create a deep copy of it
metadata = [copy.deepcopy(i.metadata) for i in documents]

# Create new Document objects with the summaries and the original metadata
text_summaries_with_metadata = []
for summary, meta in zip(text_summaries, metadata):
    # Add the suffix "(summary)" to the source in metadata
    if 'source' in meta:
        meta['source'] += ' (summary)'
    text_summaries_with_metadata.append(Document(page_content=summary, metadata=meta))


In [None]:
text_summaries_with_metadata

In [None]:
idx = index(text_summaries_with_metadata, record_manager, vectorstore, cleanup="incremental",
                                          source_id_key="source")
# Generate the list of (doc_id, document) tuples from the documents
doc_id_document_tuples = [(doc.metadata["doc_id"], doc) for doc in documents]

# Pass the list of tuples to retriever.docstore.mset
retriever.docstore.mset(doc_id_document_tuples)

In [None]:
idx

### Generating Hypothetical Questions for Each Parent Chunk

In [None]:
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
            "required": ["questions"],
        },
    }
]

In [None]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

question_chain = (
    {"doc": lambda x: x.page_content}
    # Only asking for 5 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        """Generate a list of exactly 5 hypothetical questions that the below document could be used to answer:\n\n{doc}
        seperate each question with a comma (,)
        """
    )
    | ChatOpenAI(max_retries=0, model="gpt-4o").bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [None]:
hypothetical_questions = question_chain.batch(documents, {"max_concurrency": 5})

In [None]:
hypothetical_questions

In [None]:
from langchain.schema.document import Document

hypothetical_docs = []
for original_doc, question_list in zip(documents, hypothetical_questions):
    doc_id = original_doc.metadata["doc_id"]
    page = original_doc.metadata.get("page")
    source = original_doc.metadata.get("source")

    for question in question_list:
        # Define your new metadata here with modified source
        new_metadata = {
            "doc_id": doc_id,
            "page": page,
            "source": f"{source}(question)"  # Add the suffix to the source
        }

        # Create a new Document instance for each question
        # The question itself is the page_content
        doc = Document(page_content=question, metadata=new_metadata)

        # Add the Document to the list
        hypothetical_docs.append(doc)

In [None]:
hypothetical_docs

In [None]:
idx = index(hypothetical_docs, record_manager, vectorstore, cleanup="incremental",
                                          source_id_key="source")

# Generate the list of (doc_id, document) tuples from the documents
doc_id_document_tuples = [(doc.metadata["doc_id"], doc) for doc in documents]

# Pass the list of tuples to retriever.docstore.mset
retriever.docstore.mset(doc_id_document_tuples)

In [None]:
idx

In [None]:
retriever.vectorstore.similarity_search("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

In [None]:
retriever.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

### Creating an LCEL Chain and Testing the Retriever

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt template
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4o")

# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
chain.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

In [None]:
chain.invoke("Where can I find the best smoked meat sandwiches in Montreal?")

In [None]:
chain.invoke("Where can I find the best food in Montreal?")