In [1]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before any client object is created.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# OpenAIEmbeddings / ChatOpenAI objects in later cells — confirm against the .env file.
load_dotenv()

True

### Understanding InMemory Store in LangChain

In [2]:
from pathlib import Path

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the path from its components so the separator is right on every OS;
# a hard-coded backslash path only resolves on Windows. It is kept as a plain
# string so anything that reads `file_path` still receives a str.
file_path = str(Path("data") / "montreal.pdf")

loader = PyPDFLoader(file_path=file_path)

# With text_splitter=None, load_and_split() falls back to a default
# RecursiveCharacterTextSplitter rather than skipping splitting, so a page is
# kept whole only while it fits inside that splitter's default chunk size.
# The recorded run of this notebook yielded two parent documents.
documents = loader.load_and_split(text_splitter=None)
documents

[Document(page_content="Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of \nactivities to suit every interest. Start your exploration in the historic \nOld Montreal, where cobblestone streets and 17th-century \narchitecture transport you back in time. Visit the stunning Notre-\nDame Basilica, renowned for its intricate interior and dramatic light \nshows. For a taste of the local arts scene, head to the Montreal \nMuseum of Fine Arts, home to an impressive collection of Canadian \nand international works. If you're an outdoor enthusiast, Mont Royal \noƯers scenic hiking trails and panoramic views of the city. In the \nsummer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to \nenjoy music and dance. \nFor a modern twist, the Mile End district is a must-visit. Known for its \nbohemian vibe, this area is packed with indie boutiques, art \ngalleries, and eclectic cafes.

### Using the InMemoryStore

In [3]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from database import COLLECTION_NAME

# Metadata key under which each child vector records the id of its parent.
id_key = "doc_id"

# Docstore for the parent documents (in-memory variant).
store = InMemoryStore()

# Vector store for the child vectors: a Chroma collection with OpenAI embeddings.
vectorstore = Chroma(embedding_function=OpenAIEmbeddings(), collection_name=COLLECTION_NAME)

# Retriever wiring the vector store and the docstore together through `id_key`.
retriever = MultiVectorRetriever(id_key=id_key, docstore=store, vectorstore=vectorstore)

retriever

MultiVectorRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001E1CA7BC650>, docstore=<langchain_core.stores.InMemoryBaseStore object at 0x000001E1CC1FA490>)

### Using the PostgresByteStore

In [4]:
# NOTE(review): Chroma, InMemoryStore, PostgresSaver and PickleCheckpointSerializer
# are imported here but not referenced anywhere in this cell.
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer

# Child vectors go into a PGVector collection reached through CONNECTION_STRING.
embeddings = OpenAIEmbeddings()
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
    use_jsonb=True,
)

# Parent documents go into PostgresByteStore (imported from the `store` module),
# built from the same connection string and collection name.
store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)
# Metadata key that links each child vector to its parent document.
id_key = "doc_id"

# Rebinds `retriever` (and `vectorstore` / `store`), replacing the InMemoryStore
# version built in the previous cell; every later cell uses this one.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore, 
    docstore=store, 
    id_key=id_key,
)

retriever

MultiVectorRetriever(vectorstore=<langchain_postgres.vectorstores.PGVector object at 0x000001E1CC3AC450>, docstore=<store.PostgresByteStore object at 0x000001E1CC6F5190>)

In [5]:
import uuid

# One random UUID4 string per parent document, in the same order as `documents`.
doc_ids = [str(uuid.uuid4()) for _ in range(len(documents))]
doc_ids

['2643e98e-c01e-4d7d-aed7-2bf9e345a976',
 '4ea1b189-527b-4e85-9426-b78f90a54d72']

In [6]:
# Splitter used to cut each parent document into smaller child chunks.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

all_sub_docs = []
for parent_id, parent_doc in zip(doc_ids, documents):
    children = child_text_splitter.split_documents([parent_doc])
    # Tag every child with its parent's id so the retriever can map it back.
    for child in children:
        child.metadata[id_key] = parent_id
    all_sub_docs += children

all_sub_docs

[Document(page_content='Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of \nactivities to suit every interest. Start your exploration in the historic \nOld Montreal, where cobblestone streets and 17th-century \narchitecture transport you back in time. Visit the stunning Notre-\nDame Basilica, renowned for its intricate interior and dramatic light', metadata={'source': 'data\\montreal.pdf', 'page': 0, 'doc_id': '2643e98e-c01e-4d7d-aed7-2bf9e345a976'}),
 Document(page_content="Old Montreal, where cobblestone streets and 17th-century \narchitecture transport you back in time. Visit the stunning Notre-\nDame Basilica, renowned for its intricate interior and dramatic light \nshows. For a taste of the local arts scene, head to the Montreal \nMuseum of Fine Arts, home to an impressive collection of Canadian \nand international works. If you're an outdoor enthusiast, Mont Royal", metadata={'source': 'data\\montreal.pdf', 'page': 0, 'doc_id': '2

In [7]:
# Index the child chunks in the vector store; each carries its parent's id under `id_key`.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks again — confirm before executing it more than once.
retriever.vectorstore.add_documents(all_sub_docs)
# Store the full parent documents in the docstore, keyed by their generated ids.
retriever.docstore.mset(list(zip(doc_ids, documents)))

### Testing the retriever

In [8]:
# Query the vector store directly: this returns the matching child vectors, not the parents.
# NOTE(review): the recorded output lists question-style entries whose doc_id is not
# among the ids generated above, which suggests the collection already held vectors
# from an earlier run — confirm whether it should be cleared first.
retriever.vectorstore.similarity_search("What are some unique seasonal events in Montreal that a visitor should not miss?")

[Document(page_content='What are some famous festivals that take place in Montreal?', metadata={'doc_id': '0e1f21b0-3d62-41e2-b885-4533ff4f4b05'}),
 Document(page_content='What can one do in Montreal during the winter?', metadata={'doc_id': '0e1f21b0-3d62-41e2-b885-4533ff4f4b05'}),
 Document(page_content='What outdoor activities can you enjoy in Montreal?', metadata={'doc_id': '0e1f21b0-3d62-41e2-b885-4533ff4f4b05'}),
 Document(page_content='What are some of the historic sites to visit in Montreal?', metadata={'doc_id': '0e1f21b0-3d62-41e2-b885-4533ff4f4b05'})]

In [9]:
# Same query through the retriever; the recorded output is a full parent page
# rather than one of the child chunks.
retriever.invoke("What are some unique seasonal events in Montreal that a visitor should not miss?")

[Document(page_content="Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of \nactivities to suit every interest. Start your exploration in the historic \nOld Montreal, where cobblestone streets and 17th-century \narchitecture transport you back in time. Visit the stunning Notre-\nDame Basilica, renowned for its intricate interior and dramatic light \nshows. For a taste of the local arts scene, head to the Montreal \nMuseum of Fine Arts, home to an impressive collection of Canadian \nand international works. If you're an outdoor enthusiast, Mont Royal \noƯers scenic hiking trails and panoramic views of the city. In the \nsummer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to \nenjoy music and dance. \nFor a modern twist, the Mile End district is a must-visit. Known for its \nbohemian vibe, this area is packed with indie boutiques, art \ngalleries, and eclectic cafes.

### Creating Summaries for Each Parent Chunk

In [10]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt with a single {element} slot that receives the text to summarize.
prompt_text = """You are an assistant tasked with summarizing text. \
Directly summarize the following text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Initialize the Language Model (LLM)
# NOTE: `prompt` and `model` are bound again in the RAG-chain cell further down;
# `summarize_chain` holds the objects created here.
model = ChatOpenAI(temperature=0, model="gpt-4o")

# Define the summary chain: raw input string -> {element} -> prompt -> model -> plain string
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [11]:
# Text of every parent document, in the same order as `documents`.
parent_chunk = [parent.page_content for parent in documents]
# Summarize all parents in one batch call, with at most 5 running concurrently.
text_summaries = summarize_chain.batch(parent_chunk, config={"max_concurrency": 5})

In [12]:
# Display the generated summaries (one string per parent document).
text_summaries

["Montreal offers a diverse range of activities for all interests. Explore historic Old Montreal with its cobblestone streets and 17th-century architecture, and visit the Notre-Dame Basilica. The Montreal Museum of Fine Arts showcases Canadian and international art. Outdoor enthusiasts can enjoy Mont Royal's hiking trails and panoramic views, as well as summer events like the Tam-Tams festival. The Mile End district features a bohemian atmosphere with indie boutiques, art galleries, and street art. In winter, the Underground City offers 30 kilometers of sheltered pedestrian pathways. Montreal is also renowned for its festivals, including the Montreal International Jazz Festival and Just for Laughs.",
 "Montreal's culinary scene is diverse and exciting, featuring iconic spots like Schwartz’s Deli for smoked meat sandwiches and La Banquise for various poutine dishes. For fine dining, Toqué! offers an innovative menu with seasonal Quebecois flavors, while Joe Beef combines French techniqu

In [13]:
from langchain_core.documents import Document

# One summary Document per parent, in the same order as `doc_ids`.
# The parent link is stored under `id_key` (not a hard-coded "doc_id") so the
# summaries stay retrievable if the retriever's key is ever changed.
# "page" is the position of the summary in `text_summaries`.
# NOTE(review): that equals the PDF page only while there is exactly one
# parent document per page — confirm for longer PDFs.
summary_docs = [
    Document(page_content=str(summary), metadata={"page": page, id_key: doc_id})
    for page, (summary, doc_id) in enumerate(zip(text_summaries, doc_ids))
]


In [14]:
# Display the summary Documents together with their metadata.
summary_docs

[Document(page_content="Montreal offers a diverse range of activities for all interests. Explore historic Old Montreal with its cobblestone streets and 17th-century architecture, and visit the Notre-Dame Basilica. The Montreal Museum of Fine Arts showcases Canadian and international art. Outdoor enthusiasts can enjoy Mont Royal's hiking trails and panoramic views, as well as summer events like the Tam-Tams festival. The Mile End district features a bohemian atmosphere with indie boutiques, art galleries, and street art. In winter, the Underground City offers 30 kilometers of sheltered pedestrian pathways. Montreal is also renowned for its festivals, including the Montreal International Jazz Festival and Just for Laughs.", metadata={'page': 0, 'doc_id': '2643e98e-c01e-4d7d-aed7-2bf9e345a976'}),
 Document(page_content="Montreal's culinary scene is diverse and exciting, featuring iconic spots like Schwartz’s Deli for smoked meat sandwiches and La Banquise for various poutine dishes. For f

In [15]:
# Index the summaries as additional vectors pointing at the same parents.
retriever.vectorstore.add_documents(summary_docs)
# Writes the same (doc_id, parent) pairs that the chunk-indexing cell already stored.
# NOTE(review): looks redundant unless this cell is meant to run on its own — confirm.
retriever.docstore.mset(list(zip(doc_ids, documents)))

### Generating Hypothetical Questions for Each Parent Chunk

In [16]:
# Function-calling schema: a single function whose only (required) argument
# is "questions", an array of strings.
functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=dict(
            type="object",
            properties=dict(
                questions=dict(type="array", items=dict(type="string")),
            ),
            required=["questions"],
        ),
    )
]

In [17]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Chain: Document -> prompt -> forced `hypothetical_questions` function call
# -> value found under the "questions" key of the function-call arguments.
question_chain = (
    # Feed only the document text into the prompt's {doc} slot.
    {"doc": lambda x: x.page_content}
    # Only asking for 5 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        """Generate a list of exactly 5 hypothetical questions that the below document could be used to answer:\n\n{doc}
        seperate each question with a comma (,)
        """
    )
    # NOTE(review): the prompt asks for comma-separated questions (and spells
    # "separate" as "seperate"), while the bound function schema asks for an array
    # of strings — confirm which instruction is intended before editing the prompt.
    # NOTE(review): max_retries=0 presumably disables client-side retries — confirm.
    | ChatOpenAI(max_retries=0, model="gpt-4o").bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [18]:
# Generate questions for every parent document, at most 5 requests at a time.
# The result is one list of questions per parent, in the same order as `documents`.
hypothetical_questions = question_chain.batch(documents, {"max_concurrency": 5})

In [19]:
# Display the generated questions (one inner list per parent document).
hypothetical_questions

[['What are some historical sites to visit in Montreal?',
  "Where can you experience Montreal's local arts scene?",
  'What outdoor activities are available in Montreal?',
  'What is unique about the Mile End district?',
  'What can visitors do in Montreal during the winter?'],
 ["What are the best places to experience Montreal's culinary scene?",
  'Where can I find the best smoked meat sandwiches in Montreal?',
  'What are some must-try dishes unique to Montreal?',
  'Which restaurants in Montreal offer fine dining experiences?',
  'Where can I find the best bagels in Montreal?']]

In [21]:
from langchain_core.documents import Document

# One Document per generated question; the question text is the page_content.
# The parent link is stored under `id_key` (not a hard-coded "doc_id") so the
# questions stay retrievable if the retriever's key is ever changed.
hypothetical_docs = [
    Document(page_content=question, metadata={id_key: doc_id})
    for question_list, doc_id in zip(hypothetical_questions, doc_ids)
    for question in question_list
]

In [22]:
# Display the question Documents together with their parent ids.
hypothetical_docs

[Document(page_content='What are some historical sites to visit in Montreal?', metadata={'doc_id': '2643e98e-c01e-4d7d-aed7-2bf9e345a976'}),
 Document(page_content="Where can you experience Montreal's local arts scene?", metadata={'doc_id': '2643e98e-c01e-4d7d-aed7-2bf9e345a976'}),
 Document(page_content='What outdoor activities are available in Montreal?', metadata={'doc_id': '2643e98e-c01e-4d7d-aed7-2bf9e345a976'}),
 Document(page_content='What is unique about the Mile End district?', metadata={'doc_id': '2643e98e-c01e-4d7d-aed7-2bf9e345a976'}),
 Document(page_content='What can visitors do in Montreal during the winter?', metadata={'doc_id': '2643e98e-c01e-4d7d-aed7-2bf9e345a976'}),
 Document(page_content="What are the best places to experience Montreal's culinary scene?", metadata={'doc_id': '4ea1b189-527b-4e85-9426-b78f90a54d72'}),
 Document(page_content='Where can I find the best smoked meat sandwiches in Montreal?', metadata={'doc_id': '4ea1b189-527b-4e85-9426-b78f90a54d72'}),
 D

In [23]:
# Index the hypothetical questions as additional vectors pointing at the same parents.
retriever.vectorstore.add_documents(hypothetical_docs)
# Writes the same (doc_id, parent) pairs that the chunk-indexing cell already stored.
# NOTE(review): looks redundant unless this cell is meant to run on its own — confirm.
retriever.docstore.mset(list(zip(doc_ids, documents)))

In [24]:
# Query the vector store directly to see which child vectors match.
# NOTE(review): the first recorded hit carries a doc_id that was not generated in
# this session, which again points at vectors left over from an earlier run.
retriever.vectorstore.similarity_search("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

[Document(page_content='What are some of the fine dining options available in Montreal?', metadata={'doc_id': 'e839cb13-4465-449e-8d89-253f70a0f96f'}),
 Document(page_content='Which restaurants in Montreal offer fine dining experiences?', metadata={'doc_id': '4ea1b189-527b-4e85-9426-b78f90a54d72'}),
 Document(page_content="What are the best places to experience Montreal's culinary scene?", metadata={'doc_id': '4ea1b189-527b-4e85-9426-b78f90a54d72'}),
 Document(page_content='What are some must-try dishes unique to Montreal?', metadata={'doc_id': '4ea1b189-527b-4e85-9426-b78f90a54d72'})]

In [25]:
# Same query through the retriever; the recorded output is the full "Dining in Montreal" parent page.
retriever.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

[Document(page_content='Dining in Montreal \nMontreal’s culinary scene is just as diverse and exciting as its \nactivities. Start your gastronomic journey with a visit to Schwartz’s \nDeli, an iconic establishment known for its mouth-watering smoked \nmeat sandwiches. For a true taste of Montreal, try poutine – a \ndelectable dish of fries topped with cheese curds and gravy – \navailable at La Banquise, a local favorite that o Ưers numerous \nvariations of this classic comfort food. \nFor ﬁne dining, Toqué! is a standout choice, o Ưering an innovative \nmenu that highlights seasonal ingredients and Quebecois ﬂavors. \nFoodies will also appreciate Joe Beef, a beloved bistro that \ncombines French culinary techniques with hearty, local ingredients. \nIf you’re in the mood for something sweet, a stop at Fairmount Bagel \nor St-Viateur Bagel is essential. These bagel shops have been \nserving freshly baked, wood-ﬁred bagels for decades and are a \nstaple of Montreal’s food culture. \nIn th

### Creating an LCEL Chain and Testing the Retriever

In [26]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI


def _format_docs(docs):
    """Join the text of the retrieved parent documents into one context string.

    Without this step the prompt receives the repr of a list of Document
    objects (metadata dicts, escaped newlines) instead of clean text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Prompt template
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
# NOTE: `prompt` and `model` rebind the names used by the summarization cell;
# `summarize_chain` keeps the objects it was built with.
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4o")

# RAG pipeline: the question goes to the retriever, the retrieved parents are
# flattened to plain text, and the untouched question is passed alongside.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [27]:
# End-to-end check: the recorded answer names a Syrian restaurant for this cuisine-specific question.
chain.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

'For those interested in Middle Eastern cuisine, Damas is a high-end Syrian restaurant in Montreal known for its exquisite Middle Eastern cuisine.'

In [28]:
# This question is word-for-word one of the generated hypothetical questions shown above.
chain.invoke("Where can I find the best smoked meat sandwiches in Montreal?")

'You can find the best smoked meat sandwiches in Montreal at Schwartz’s Deli.'

In [29]:
# Broader question; the recorded answer lists several restaurants from the dining page.
chain.invoke("Where can I find the best food in Montreal?")

'The best food in Montreal can be found at several notable establishments:\n\n1. **Schwartz’s Deli** - Known for its mouth-watering smoked meat sandwiches.\n2. **La Banquise** - Famous for its poutine, offering numerous variations of this classic comfort food.\n3. **Toqué!** - A fine dining restaurant with an innovative menu highlighting seasonal ingredients and Quebecois flavors.\n4. **Joe Beef** - A beloved bistro combining French culinary techniques with hearty, local ingredients.\n5. **Fairmount Bagel and St-Viateur Bagel** - Essential stops for freshly baked, wood-fired bagels.\n6. **Candide** - Offers contemporary dishes with a focus on local and sustainable ingredients.\n7. **Damas** - A high-end Syrian restaurant known for its exquisite Middle Eastern cuisine.\n\nThese diverse dining options ensure that Montreal offers something for every palate.'