In [1]:
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI clients
# created further down — confirm against the project's .env.
load_dotenv()

True

### Understanding InMemory Store in LangChain

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the path with os.path.join so the notebook also runs on macOS/Linux;
# a hard-coded backslash separator only resolves on Windows.
file_path = os.path.join("data", "montreal.pdf")

loader = PyPDFLoader(file_path=file_path)

# load() yields one Document per PDF page, which is what the parent documents
# are meant to be. load_and_split(text_splitter=None) does NOT mean "no
# splitting": it falls back to a default RecursiveCharacterTextSplitter, so
# a long page would silently be cut into several parents.
documents = loader.load()
documents

[Document(page_content="Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of \nactivities to suit every interest. Start your exploration in the historic \nOld Montreal, where cobblestone streets and 17th-century \narchitecture transport you back in time. Visit the stunning Notre-\nDame Basilica, renowned for its intricate interior and dramatic light \nshows. For a taste of the local arts scene, head to the Montreal \nMuseum of Fine Arts, home to an impressive collection of Canadian \nand international works. If you're an outdoor enthusiast, Mont Royal \noƯers scenic hiking trails and panoramic views of the city. In the \nsummer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to \nenjoy music and dance. \nFor a modern twist, the Mile End district is a must-visit. Known for its \nbohemian vibe, this area is packed with indie boutiques, art \ngalleries, and eclectic cafes.

### Using the InMemoryStore

In [3]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from database import COLLECTION_NAME

# Metadata field that ties every indexed vector back to its parent document.
id_key = "doc_id"

# Parent documents are kept in an in-memory key-value store.
store = InMemoryStore()

# Searchable entries are embedded with OpenAI and indexed in a Chroma collection.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name=COLLECTION_NAME,
)

# The retriever searches `vectorstore` and resolves hits to parents in `store`
# through the shared `id_key`.
retriever = MultiVectorRetriever(id_key=id_key, docstore=store, vectorstore=vectorstore)

retriever

MultiVectorRetriever(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001B94DE63B90>, docstore=<langchain_core.stores.InMemoryBaseStore object at 0x000001B950A7CB10>)

### Using the PostgresByteStore

In [None]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer

# Metadata field that ties every indexed vector back to its parent document.
id_key = "doc_id"

embeddings = OpenAIEmbeddings()

# Postgres-backed vector index for the searchable entries.
vectorstore = PGVector(
    connection=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings,
    use_jsonb=True,
)

# Parent documents go to the project's Postgres byte store, using the same
# connection string and collection name as the vector index.
store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)

retriever = MultiVectorRetriever(id_key=id_key, docstore=store, vectorstore=vectorstore)

retriever

In [4]:
import uuid

# One random UUID4 string per parent document, in the same order as `documents`.
doc_ids = [str(uuid.uuid4()) for _ in range(len(documents))]
doc_ids

['3b849434-11f2-45db-b2fb-121ffe77f990',
 '3d68a4c4-1cd9-4995-9701-0b6d57576566']

In [5]:
# Splitter that cuts each parent document into child chunks of at most 400 characters.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

all_sub_docs = []
for parent_id, parent_doc in zip(doc_ids, documents):
    children = child_text_splitter.split_documents([parent_doc])
    # Stamp every child with its parent's id so the retriever can map a hit
    # on a child back to the full parent document.
    for child in children:
        child.metadata[id_key] = parent_id
    all_sub_docs += children

all_sub_docs

[Document(page_content='Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of \nactivities to suit every interest. Start your exploration in the historic \nOld Montreal, where cobblestone streets and 17th-century \narchitecture transport you back in time. Visit the stunning Notre-\nDame Basilica, renowned for its intricate interior and dramatic light', metadata={'source': 'data\\montreal.pdf', 'page': 0, 'doc_id': '3b849434-11f2-45db-b2fb-121ffe77f990'}),
 Document(page_content="Old Montreal, where cobblestone streets and 17th-century \narchitecture transport you back in time. Visit the stunning Notre-\nDame Basilica, renowned for its intricate interior and dramatic light \nshows. For a taste of the local arts scene, head to the Montreal \nMuseum of Fine Arts, home to an impressive collection of Canadian \nand international works. If you're an outdoor enthusiast, Mont Royal", metadata={'source': 'data\\montreal.pdf', 'page': 0, 'doc_id': '3

In [6]:
# Index the child chunks for similarity search.
retriever.vectorstore.add_documents(all_sub_docs)

# Persist the parent documents under their ids in the docstore.
parent_pairs = list(zip(doc_ids, documents))
retriever.docstore.mset(parent_pairs)

### Testing the retriever

In [23]:
# Query the vector index directly to see which indexed entries match,
# without resolving them to parent documents.
seasonal_events_query = "What are some unique seasonal events in Montreal that a visitor should not miss?"
retriever.vectorstore.similarity_search(seasonal_events_query)

[Document(page_content='Montreal is a lively city with a rich cultural heritage and a variety of activities. Visitors can explore the historic Old Montreal, visit the Notre-Dame Basilica, and enjoy the Montreal Museum of Fine Arts. Outdoor enthusiasts can hike Mont Royal, which offers scenic trails and city views. The Mile End district is known for its bohemian atmosphere, indie boutiques, art galleries, and vibrant street art. The Underground City offers a unique experience with over 30 kilometers of pedestrian pathways connecting various establishments. The city is also known for its festivals, including the Montreal International Jazz Festival and Just for Laughs.', metadata={'doc_id': '3b849434-11f2-45db-b2fb-121ffe77f990', 'page': 0}),
 Document(page_content='Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of \nactivities to suit every interest. Start your exploration in the historic \nOld Montreal, where cobblestone streets and 17t

In [24]:
# Same question through the full retriever, which returns the parent
# documents stored in the docstore rather than the matched index entries.
seasonal_events_query = "What are some unique seasonal events in Montreal that a visitor should not miss?"
retriever.invoke(seasonal_events_query)

[Document(page_content="Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of \nactivities to suit every interest. Start your exploration in the historic \nOld Montreal, where cobblestone streets and 17th-century \narchitecture transport you back in time. Visit the stunning Notre-\nDame Basilica, renowned for its intricate interior and dramatic light \nshows. For a taste of the local arts scene, head to the Montreal \nMuseum of Fine Arts, home to an impressive collection of Canadian \nand international works. If you're an outdoor enthusiast, Mont Royal \noƯers scenic hiking trails and panoramic views of the city. In the \nsummer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to \nenjoy music and dance. \nFor a modern twist, the Mile End district is a must-visit. Known for its \nbohemian vibe, this area is packed with indie boutiques, art \ngalleries, and eclectic cafes.

### Creating Summaries for Each Parent Chunk

In [71]:
# These names are otherwise only imported by the RAG-pipeline cell near the end
# of the notebook; importing them here keeps this cell from raising NameError
# when the notebook is run top to bottom on a fresh kernel.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt with a single placeholder, {element}, that receives the text to summarize.
prompt_text = """You are an assistant tasked with summarizing text. \
Directly summarize the following text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Initialize the Language Model (LLM)
model = ChatOpenAI(temperature=0, model="gpt-4")

# Define the summary chain: wrap the raw input string as {"element": ...},
# fill the prompt, call the model, and return the reply as a plain string.
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [18]:
# Raw text of every parent document, in document order.
parent_chunk = [parent.page_content for parent in documents]

# Summarize all parents in one batched call, capped at 5 concurrent requests.
text_summaries = summarize_chain.batch(parent_chunk, config={"max_concurrency": 5})

In [19]:
# Display the generated summaries (one per parent document) for inspection.
text_summaries

['Montreal is a lively city with a rich cultural heritage and a variety of activities. Visitors can explore the historic Old Montreal, visit the Notre-Dame Basilica, and enjoy the Montreal Museum of Fine Arts. Outdoor enthusiasts can hike Mont Royal, which offers scenic trails and city views. The Mile End district is known for its bohemian atmosphere, indie boutiques, art galleries, and vibrant street art. The Underground City offers a unique experience with over 30 kilometers of pedestrian pathways connecting various establishments. The city is also known for its festivals, including the Montreal International Jazz Festival and Just for Laughs.',
 "Montreal's diverse culinary scene includes iconic establishments like Schwartz’s Deli, known for smoked meat sandwiches, and La Banquise, famous for its poutine. For fine dining, Toqué! offers a menu with seasonal ingredients and Quebecois flavors, while Joe Beef combines French culinary techniques with local ingredients. Sweet lovers can v

In [20]:
from langchain.schema.document import Document

# Wrap each summary in a Document whose metadata carries the position of the
# summarized parent ("page") and that parent's id ("doc_id"), so a hit on a
# summary can be resolved to the full parent document.
summary_docs = [
    Document(
        page_content=str(summary_text),
        metadata={"page": position, "doc_id": parent_id},
    )
    for position, (summary_text, parent_id) in enumerate(zip(text_summaries, doc_ids))
]


In [21]:
# Display the summary Documents to verify their text and metadata.
summary_docs

[Document(page_content='Montreal is a lively city with a rich cultural heritage and a variety of activities. Visitors can explore the historic Old Montreal, visit the Notre-Dame Basilica, and enjoy the Montreal Museum of Fine Arts. Outdoor enthusiasts can hike Mont Royal, which offers scenic trails and city views. The Mile End district is known for its bohemian atmosphere, indie boutiques, art galleries, and vibrant street art. The Underground City offers a unique experience with over 30 kilometers of pedestrian pathways connecting various establishments. The city is also known for its festivals, including the Montreal International Jazz Festival and Just for Laughs.', metadata={'page': 0, 'doc_id': '3b849434-11f2-45db-b2fb-121ffe77f990'}),
 Document(page_content="Montreal's diverse culinary scene includes iconic establishments like Schwartz’s Deli, known for smoked meat sandwiches, and La Banquise, famous for its poutine. For fine dining, Toqué! offers a menu with seasonal ingredients

In [22]:
# Add the summaries to the vector index next to the entries already there.
retriever.vectorstore.add_documents(summary_docs)

# Write the parent documents to the docstore under their ids.
retriever.docstore.mset(
    [(parent_id, parent_doc) for parent_id, parent_doc in zip(doc_ids, documents)]
)

In [41]:
# JSON schema for the function's arguments: an object with one required
# field, "questions", holding an array of strings.
_QUESTIONS_PARAMETERS = {
    "type": "object",
    "properties": {
        "questions": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["questions"],
}

# Function specification list that is later bound to the chat model.
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": _QUESTIONS_PARAMETERS,
    }
]

In [45]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
# ChatPromptTemplate and ChatOpenAI are otherwise only imported by the
# RAG-pipeline cell near the end of the notebook; importing them here keeps this
# cell from raising NameError when the notebook is run top to bottom on a
# fresh kernel.
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Chain that turns one Document into a list of hypothetical questions:
#   1. extract the document text as {doc},
#   2. fill the prompt,
#   3. call the model with a forced call to the `hypothetical_questions` function,
#   4. parse the function-call arguments and return the value under "questions".
# NOTE(review): the prompt text is intentionally left untouched (including the
# comma instruction); the bound function schema already asks for an array of
# strings, so that instruction may be redundant — confirm before editing.
question_chain = (
    {"doc": lambda x: x.page_content}
    # Only asking for 5 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        """Generate a list of exactly 5 hypothetical questions that the below document could be used to answer:\n\n{doc}
        seperate each question with a comma (,)
        """
    )
    | ChatOpenAI(max_retries=0, model="gpt-4").bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [48]:
# Generate a question list for every parent document in one batched call,
# capped at 5 concurrent requests.
hypothetical_questions = question_chain.batch(
    documents,
    config={"max_concurrency": 5},
)

In [47]:
# Display the generated question lists (one list per parent document).
hypothetical_questions

[['What can I expect to see in Old Montreal?',
  'What kind of art can be found at the Montreal Museum of Fine Arts?',
  'What activities are available at Mont Royal during the summer?',
  'What makes the Mile End district unique?',
  'What is the purpose of Montreal’s Underground City?'],
 ['What are some iconic food establishments in Montreal?',
  "Where can one find a true taste of Montreal's local food - poutine?",
  'What are some fine dining options in Montreal that highlight Quebecois flavors?',
  'Which restaurants in Montreal are known for their focus on local and sustainable ingredients?',
  'Which bagel shops in Montreal have been serving freshly baked, wood-fired bagels for decades?']]

In [64]:
from langchain.schema.document import Document

# Flatten the per-parent question lists into one list of Documents. Each
# question becomes the page_content, tagged with the id of the parent it was
# generated from; every Document gets its own metadata dict.
hypothetical_docs = [
    Document(page_content=question, metadata={"doc_id": parent_id})
    for question_list, parent_id in zip(hypothetical_questions, doc_ids)
    for question in question_list
]

In [65]:
# Display the question Documents to verify their text and parent ids.
hypothetical_docs

[Document(page_content='What are some of the activities you can do in Old Montreal?', metadata={'doc_id': '3b849434-11f2-45db-b2fb-121ffe77f990'}),
 Document(page_content='What is the Montreal Museum of Fine Arts known for?', metadata={'doc_id': '3b849434-11f2-45db-b2fb-121ffe77f990'}),
 Document(page_content='What outdoor activities are available in Mont Royal during the summer?', metadata={'doc_id': '3b849434-11f2-45db-b2fb-121ffe77f990'}),
 Document(page_content='What are some of the unique features of the Mile End district in Montreal?', metadata={'doc_id': '3b849434-11f2-45db-b2fb-121ffe77f990'}),
 Document(page_content='What facilities can be accessed through Montreal’s Underground City during winter?', metadata={'doc_id': '3b849434-11f2-45db-b2fb-121ffe77f990'}),
 Document(page_content='What are some iconic food establishments in Montreal?', metadata={'doc_id': '3d68a4c4-1cd9-4995-9701-0b6d57576566'}),
 Document(page_content='Where can I get the best poutine in Montreal?', metad

In [66]:
# Add the hypothetical questions to the vector index.
retriever.vectorstore.add_documents(hypothetical_docs)

# Write the parent documents to the docstore under their ids.
retriever.docstore.mset([*zip(doc_ids, documents)])

In [67]:
# Query the vector index directly to see which indexed entries match,
# without resolving them to parent documents.
middle_eastern_query = "What dining options are available in Montreal for those interested in Middle Eastern cuisine?"
retriever.vectorstore.similarity_search(middle_eastern_query)

[Document(page_content='What dining options are available in Montreal for those interested in Middle Eastern cuisine?', metadata={'doc_id': '3d68a4c4-1cd9-4995-9701-0b6d57576566'}),
 Document(page_content='What are some fine dining options in Montreal?', metadata={'doc_id': '3d68a4c4-1cd9-4995-9701-0b6d57576566'}),
 Document(page_content='staple of Montreal’s food culture. \nIn the mood for something more exotic? Try Candide, which serves \ncontemporary dishes with a focus on local and sustainable \ningredients, or Damas, a high-end Syrian restaurant known for its \nexquisite Middle Eastern cuisine. With its diverse array of dining \noptions, Montreal truly o Ưers something for every palate.', metadata={'doc_id': '3d68a4c4-1cd9-4995-9701-0b6d57576566', 'page': 1, 'source': 'data\\montreal.pdf'}),
 Document(page_content='What are some iconic food establishments in Montreal?', metadata={'doc_id': '3d68a4c4-1cd9-4995-9701-0b6d57576566'})]

In [68]:
# Same question through the full retriever, which returns the parent
# documents stored in the docstore rather than the matched index entries.
middle_eastern_query = "What dining options are available in Montreal for those interested in Middle Eastern cuisine?"
retriever.invoke(middle_eastern_query)

[Document(page_content='Dining in Montreal \nMontreal’s culinary scene is just as diverse and exciting as its \nactivities. Start your gastronomic journey with a visit to Schwartz’s \nDeli, an iconic establishment known for its mouth-watering smoked \nmeat sandwiches. For a true taste of Montreal, try poutine – a \ndelectable dish of fries topped with cheese curds and gravy – \navailable at La Banquise, a local favorite that o Ưers numerous \nvariations of this classic comfort food. \nFor ﬁne dining, Toqué! is a standout choice, o Ưering an innovative \nmenu that highlights seasonal ingredients and Quebecois ﬂavors. \nFoodies will also appreciate Joe Beef, a beloved bistro that \ncombines French culinary techniques with hearty, local ingredients. \nIf you’re in the mood for something sweet, a stop at Fairmount Bagel \nor St-Viateur Bagel is essential. These bagel shops have been \nserving freshly baked, wood-ﬁred bagels for decades and are a \nstaple of Montreal’s food culture. \nIn th

In [69]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt that restricts the answer to the retrieved context.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Chat model used to answer; temperature 0, matching the summarization model.
model = ChatOpenAI(model="gpt-4", temperature=0)

# RAG pipeline: the incoming question is sent to the retriever to produce
# {context} and is also passed through unchanged as {question}; the filled
# prompt goes to the model and the reply is returned as a plain string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [70]:
# Run the full RAG pipeline end to end on a sample question.
middle_eastern_query = "What dining options are available in Montreal for those interested in Middle Eastern cuisine?"
chain.invoke(middle_eastern_query)

'Damas, a high-end Syrian restaurant known for its exquisite Middle Eastern cuisine, is a dining option available in Montreal for those interested in Middle Eastern cuisine.'