In [1]:
from dotenv import load_dotenv
import os

# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key and the database settings used below — confirm.
load_dotenv()

True

### Understanding Store in LangChain

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the path with os.path.join (os is imported in the first cell) so the
# notebook also runs on POSIX systems. On Windows this still produces
# data\montreal.pdf, so the `source` metadata of the loaded pages is unchanged.
file_path = os.path.join("data", "montreal.pdf")

loader = PyPDFLoader(file_path=file_path)

# With text_splitter=None, load_and_split falls back to its default splitter
# rather than skipping splitting. The pages of this PDF are short enough to
# come back as one Document per page (see the output below).
# NOTE(review): a page longer than the default chunk size would be split further — confirm that is acceptable.
documents = loader.load_and_split(text_splitter=None)
documents

[Document(page_content="Things to Do in Montreal \nMontréal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal  \nIf you're an outdoor enthusiast, Mont Royal o Ưers scenic hiking trails and panoramic views \nof the city. In the summer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to enjoy music and dance. \nMile End District   \nFor a modern twist, the Mile End district is a must-visit. Known for its bohemian vib

### Using the PostgresByteStore

In [3]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from utils.store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer
from utils.custom_sql_record_manager import CustomSQLRecordManager
from utils.index_with_ids import index_with_ids

# Embedding model handed to the vector store.
embeddings = OpenAIEmbeddings()

# Postgres-backed vector store for the searchable representations.
vectorstore = PGVector(
    connection=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings,
    use_jsonb=True,
)

# Metadata key shared by the indexed representations and the parent documents.
id_key = "doc_id"

# Postgres-backed docstore that receives the parent documents via mset().
store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)

# Retriever wired to both stores through id_key.
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

# Record manager passed to index_with_ids() in the indexing cells.
namespace = f"pgvector/{COLLECTION_NAME}"
record_manager = CustomSQLRecordManager(namespace, db_url=CONNECTION_STRING)
record_manager.create_schema()

retriever

MultiVectorRetriever(vectorstore=<langchain_postgres.vectorstores.PGVector object at 0x00000263E43CD2D0>, docstore=<utils.store.PostgresByteStore object at 0x00000263E471BCD0>)

In [None]:
# import uuid

# # Add a unique doc_id to each document's metadata
# for doc in documents:
#     doc.metadata["doc_id"] = str(uuid.uuid4())

In [4]:
from utils.utils import generate_reproducible_id_by_content

# Add a reproducible unique doc_id to each document's metadata.
# The id is derived from the page content and the document's metadata. Any
# doc_id left over from an earlier run of this cell is excluded from that
# input, so re-running the cell passes the helper the same arguments as the
# first run.
# NOTE(review): assumes generate_reproducible_id_by_content may take the whole metadata dict into account — confirm.
for doc in documents:
    base_metadata = {key: value for key, value in doc.metadata.items() if key != "doc_id"}
    doc.metadata["doc_id"] = generate_reproducible_id_by_content(doc.page_content, base_metadata)

In [5]:
documents

[Document(page_content="Things to Do in Montreal \nMontréal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal  \nIf you're an outdoor enthusiast, Mont Royal o Ưers scenic hiking trails and panoramic views \nof the city. In the summer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to enjoy music and dance. \nMile End District   \nFor a modern twist, the Mile End district is a must-visit. Known for its bohemian vib

In [6]:
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every parent document into smaller chunks for the vector store.
# Each parent is expected to carry a 'doc_id' in its metadata already.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

all_sub_docs = []
for parent in documents:
    parent_id = parent.metadata.get("doc_id")
    parent_source = parent.metadata.get("source")

    children = child_text_splitter.split_documents([parent])
    for child in children:
        # Tie the chunk to its parent and tag the source as a small chunk.
        child.metadata.update(
            {
                "doc_id": parent_id,
                "source": f"{parent_source}(smaller chunk)",
            }
        )
    all_sub_docs += children

# Show the resulting chunks
all_sub_docs


[Document(page_content='Things to Do in Montreal \nMontréal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows.', metadata={'source': 'data\\montreal.pdf(smaller chunk)', 'page': 0, 'doc_id': '44840752-2ea4-51bd-9ece-fd7cf31454b9'}),
 Document(page_content='century architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal', metadata={'source': 'data\\montreal.pdf(smaller chunk)', 'page': 0, 'doc_id':

In [7]:
# Index the small chunks in the vector store through the record manager.
idx = index_with_ids(
    all_sub_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

# Write each parent document to the docstore as a (doc_id, document) pair.
doc_id_document_tuples = [(parent.metadata["doc_id"], parent) for parent in documents]
retriever.docstore.mset(doc_id_document_tuples)

In [8]:
idx

{'status': 'success',
 'ids': [{'key': '3a4ade9a-4a8e-5b46-9ec8-112ced503fe6', 'operation': 'SKIP'},
  {'key': '6b1f540c-debf-58cc-9f82-861ebce92536', 'operation': 'SKIP'},
  {'key': 'f5281ed9-73b3-58a1-917c-36dff1b14ec1', 'operation': 'SKIP'},
  {'key': '4f6633c1-2b12-5f63-a447-979984220e2d', 'operation': 'SKIP'},
  {'key': '64627492-283d-5b95-af68-3187770e2877', 'operation': 'SKIP'},
  {'key': '0f5c3a6c-100b-5ed9-ad77-d2ebbe34f91e', 'operation': 'SKIP'},
  {'key': '44f33f3e-6046-51fb-8bb7-9ba26a4d30a3', 'operation': 'SKIP'},
  {'key': 'b84cc28f-308f-5042-b610-6e9c36e0368d', 'operation': 'SKIP'},
  {'key': 'e263e3b8-6a68-5793-8c9f-60415a95f23c', 'operation': 'SKIP'},
  {'key': '99408ed0-96ab-5eb4-9650-40497861fb9d', 'operation': 'SKIP'},
  {'key': 'a94e4047-3a07-5c88-b449-3f9876ae98a6', 'operation': 'SKIP'},
  {'key': 'd16ecc12-0c16-51f6-b280-80dbdbf3958c', 'operation': 'SKIP'},
  {'key': '3b9c4ef0-00a6-5c69-8a62-d49cdc5ca0cd', 'operation': 'SKIP'}],
 'results': [{'num_added': 0,
   '

### Creating Summaries for Each Parent Chunk

In [9]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Summarization prompt; {element} is filled with the text passed to the chain.
prompt_text = (
    "You are an assistant tasked with summarizing text. "
    "Directly summarize the following text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Chat model used for the summaries
model = ChatOpenAI(model="gpt-4o", temperature=0)

# Chain: raw text -> prompt -> model -> plain string
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [10]:
documents

[Document(page_content="Things to Do in Montreal \nMontréal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal  \nIf you're an outdoor enthusiast, Mont Royal o Ưers scenic hiking trails and panoramic views \nof the city. In the summer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to enjoy music and dance. \nMile End District   \nFor a modern twist, the Mile End district is a must-visit. Known for its bohemian vib

In [11]:
# Summarize the text of every parent document, with at most five concurrent calls.
parent_chunk = [parent.page_content for parent in documents]
text_summaries = summarize_chain.batch(parent_chunk, config={"max_concurrency": 5})

In [None]:
# from langchain.schema.document import Document

# # Extract metadata from documents
# metadata = [i.metadata for i in documents]

# # Create new Document objects with the summaries and the original metadata
# text_summaries_with_metadata = [
#     Document(page_content=summary, metadata=meta)
#     for summary, meta in zip(text_summaries, metadata)
# ]

In [13]:
from langchain.schema.document import Document
import copy

# Metadata of each parent document, in the same order as the documents list.
metadata = [parent.metadata for parent in documents]

# Pair every summary with a deep copy of its parent's metadata so the parent's
# own metadata is left untouched, and tag the copied source as a summary.
text_summaries_with_metadata = []
for summary, meta in zip(text_summaries, metadata):
    new_meta = copy.deepcopy(meta)
    if 'source' in new_meta:
        new_meta['source'] = new_meta['source'] + ' (summary)'

    summary_doc = Document(page_content=summary, metadata=new_meta)
    text_summaries_with_metadata.append(summary_doc)



In [16]:
documents

[Document(page_content="Things to Do in Montreal \nMontréal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal  \nIf you're an outdoor enthusiast, Mont Royal o Ưers scenic hiking trails and panoramic views \nof the city. In the summer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to enjoy music and dance. \nMile End District   \nFor a modern twist, the Mile End district is a must-visit. Known for its bohemian vib

In [19]:
text_summaries_with_metadata

[Document(page_content='Montréal offers a variety of activities for all interests, reflecting its rich cultural heritage. Start in Old Montreal with its historic architecture and the Notre-Dame Basilica. Art enthusiasts should visit the Montreal Museum of Fine Arts. Outdoor lovers can explore Mont Royal for hiking and city views, and enjoy summer events like the Tam-Tams festival. The Mile End district offers a bohemian atmosphere with indie shops, galleries, and street art. In winter, the Underground City provides 30 kilometers of sheltered pathways connecting shops and metro stations. The city is also renowned for festivals like the Montreal International Jazz Festival and Just for Laughs.', metadata={'source': 'data\\montreal.pdf (summary)', 'page': 0, 'doc_id': '44840752-2ea4-51bd-9ece-fd7cf31454b9'}),
 Document(page_content="Montreal's culinary scene is diverse and exciting, featuring iconic establishments like Schwartz’s Deli for smoked meat sandwiches and La Banquise for poutine

In [17]:
# Index the summaries in the vector store through the record manager.
idx = index_with_ids(
    text_summaries_with_metadata,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

# Write each parent document to the docstore as a (doc_id, document) pair.
parent_ids = [parent.metadata["doc_id"] for parent in documents]
doc_id_document_tuples = list(zip(parent_ids, documents))
retriever.docstore.mset(doc_id_document_tuples)

In [18]:
idx

{'status': 'success',
 'ids': [{'key': '5505cf9b-a554-5490-8f02-350e72e79393', 'operation': 'INS'},
  {'key': 'd70df60f-f993-5fdf-8c43-c506e8171c89', 'operation': 'SKIP'}],
 'results': [{'num_added': 1,
   'num_updated': 0,
   'num_skipped': 1,
   'num_deleted': 1}]}

In [None]:
# retriever.vectorstore.add_documents(summary_docs)
# retriever.docstore.mset(list(zip(doc_ids, documents)))

### Generating Hypothetical Questions for Each Parent Chunk

In [20]:
# Function schema bound to the chat model in question_chain: a single function
# whose only (required) argument is "questions", an array of strings.
functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=dict(
            type="object",
            properties=dict(
                questions=dict(type="array", items=dict(type="string")),
            ),
            required=["questions"],
        ),
    )
]

In [21]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Prompt requesting five questions; {doc} is filled with the document text.
# Only asking for 5 hypothetical questions, but this could be adjusted.
question_prompt = ChatPromptTemplate.from_template(
    """Generate a list of exactly 5 hypothetical questions that the below document could be used to answer:\n\n{doc}
        seperate each question with a comma (,)
        """
)

# Chat model bound to the `hypothetical_questions` function from `functions`.
question_llm = ChatOpenAI(max_retries=0, model="gpt-4o").bind(
    functions=functions, function_call={"name": "hypothetical_questions"}
)

# Chain: Document -> page_content -> prompt -> function call -> value of the "questions" key
question_chain = (
    {"doc": lambda x: x.page_content}
    | question_prompt
    | question_llm
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [22]:
# Run the question chain over every parent document, with at most five concurrent calls.
hypothetical_questions = question_chain.batch(documents, config={"max_concurrency": 5})

In [23]:
hypothetical_questions

[['What historical attractions can be found in Old Montreal?',
  'Where can I experience local and international art in Montreal?',
  'What are some outdoor activities available at Mont Royal?',
  'What makes the Mile End district a unique place to visit?',
  'How can visitors navigate Montreal during the winter?'],
 ['What is a must-visit deli in Montreal for smoked meat sandwiches?',
  'Where can one find the best poutine in Montreal?',
  'Which restaurant in Montreal is known for its innovative fine dining menu?',
  'What bistro in Montreal combines French culinary techniques with local ingredients?',
  'Where can you find freshly baked, wood-fired bagels in Montreal?']]

In [None]:
# from langchain.schema.document import Document

# hypothetical_docs = []
# for question_list, doc_id in zip(hypothetical_questions, doc_ids):
#     for question in question_list:
#         # Define your new metadata here
#         new_metadata = {"doc_id": doc_id}

#         # Create a new Document instance for each question
#         # The question itself is the page_content
#         doc = Document(page_content=question, metadata=new_metadata)

#         # Add the Document to the list
#         hypothetical_docs.append(doc)

In [24]:
documents

[Document(page_content="Things to Do in Montreal \nMontréal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal  \nIf you're an outdoor enthusiast, Mont Royal o Ưers scenic hiking trails and panoramic views \nof the city. In the summer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to enjoy music and dance. \nMile End District   \nFor a modern twist, the Mile End district is a must-visit. Known for its bohemian vib

In [25]:
from langchain.schema.document import Document

# Turn every generated question into its own Document that points back to the
# parent it was generated from (same doc_id and page, source tagged as a question).
hypothetical_docs = []
for parent, questions in zip(documents, hypothetical_questions):
    parent_meta = parent.metadata
    question_meta = {
        "doc_id": parent_meta["doc_id"],
        "page": parent_meta.get("page"),
        "source": f"{parent_meta.get('source')}(question)",
    }

    # The question text is the page_content; each Document gets its own metadata dict.
    hypothetical_docs.extend(
        Document(page_content=text, metadata=dict(question_meta)) for text in questions
    )

In [26]:
hypothetical_docs

[Document(page_content='What historical attractions can be found in Old Montreal?', metadata={'doc_id': '44840752-2ea4-51bd-9ece-fd7cf31454b9', 'page': 0, 'source': 'data\\montreal.pdf(question)'}),
 Document(page_content='Where can I experience local and international art in Montreal?', metadata={'doc_id': '44840752-2ea4-51bd-9ece-fd7cf31454b9', 'page': 0, 'source': 'data\\montreal.pdf(question)'}),
 Document(page_content='What are some outdoor activities available at Mont Royal?', metadata={'doc_id': '44840752-2ea4-51bd-9ece-fd7cf31454b9', 'page': 0, 'source': 'data\\montreal.pdf(question)'}),
 Document(page_content='What makes the Mile End district a unique place to visit?', metadata={'doc_id': '44840752-2ea4-51bd-9ece-fd7cf31454b9', 'page': 0, 'source': 'data\\montreal.pdf(question)'}),
 Document(page_content='How can visitors navigate Montreal during the winter?', metadata={'doc_id': '44840752-2ea4-51bd-9ece-fd7cf31454b9', 'page': 0, 'source': 'data\\montreal.pdf(question)'}),
 Do

In [27]:
# Index the hypothetical questions in the vector store through the record manager.
idx = index_with_ids(
    hypothetical_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)

# Write each parent document to the docstore as a (doc_id, document) pair.
doc_id_document_tuples = []
for parent in documents:
    doc_id_document_tuples.append((parent.metadata["doc_id"], parent))

retriever.docstore.mset(doc_id_document_tuples)

In [28]:
idx

{'status': 'success',
 'ids': [{'key': 'ba823492-3737-5d19-861c-ef04d1f1476f', 'operation': 'INS'},
  {'key': 'f1452626-b31d-530b-8721-0059d4493aff', 'operation': 'INS'},
  {'key': '9b06b25b-cfaa-505d-8121-d5aeeaa95197', 'operation': 'INS'},
  {'key': '6b0b99f5-c96f-56f2-bc62-c94053ccece6', 'operation': 'INS'},
  {'key': '33c6f2f0-6ea8-55b4-86ad-08c97f87c238', 'operation': 'INS'},
  {'key': '61019741-a6dc-50d2-83ff-7572eefa155e', 'operation': 'INS'},
  {'key': '9332c277-dca9-5651-9f34-d98fc79ea0b7', 'operation': 'INS'},
  {'key': 'ab064cce-ac7f-5198-b69e-3b6712c30dbc', 'operation': 'INS'},
  {'key': '70c6744a-0c85-54f6-9691-9f79be122ab2', 'operation': 'INS'},
  {'key': '3b4ea6cc-6a21-5750-a87a-35510f887e43', 'operation': 'INS'}],
 'results': [{'num_added': 10,
   'num_updated': 0,
   'num_skipped': 0,
   'num_deleted': 0}]}

In [None]:
# retriever.vectorstore.add_documents(hypothetical_docs)
# retriever.docstore.mset(list(zip(doc_ids, documents)))

In [29]:
# Query the vector store directly: the hits are the small indexed representations
# (generated questions, smaller chunks), each carrying its parent's doc_id in metadata.
retriever.vectorstore.similarity_search("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

[Document(page_content='Which restaurant in Montreal is known for its innovative fine dining menu?', metadata={'page': 1, 'doc_id': '8ce6d14a-45f3-559f-a679-09f830fd5199', 'source': 'data\\montreal.pdf(question)'}),
 Document(page_content='What bistro in Montreal combines French culinary techniques with local ingredients?', metadata={'page': 1, 'doc_id': '8ce6d14a-45f3-559f-a679-09f830fd5199', 'source': 'data\\montreal.pdf(question)'}),
 Document(page_content='decades and are a staple of Montreal’s food culture. \nCandide and Damas  \nIn the mood for something more exotic? Try Candide, which serves contemporary dishes \nwith a focus on local and sustainable ingredients, or Damas, a high-end Syrian restaurant \nknown for its exquisite Middle Eastern cuisine. \nWith its diverse array of dining options, Montreal truly o Ưers something for every palate.', metadata={'page': 1, 'doc_id': '8ce6d14a-45f3-559f-a679-09f830fd5199', 'source': 'data\\montreal.pdf(smaller chunk)'}),
 Document(page_c

In [30]:
# Same query through the MultiVectorRetriever: the result is the full parent page
# rather than the matched question or chunk (compare with the previous output).
retriever.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

[Document(page_content='Dining in Montreal \nMontreal’s culinary scene is just as diverse and exciting as its activities. \nSchwartz’s Deli  \nStart your gastronomic journey with a visit to Schwartz’s Deli, an iconic establishment \nknown for its mouth-watering smoked meat sandwiches. \nLa Banquise  \nFor a true taste of Montreal, try poutine – a delectable dish of fries topped with cheese \ncurds and gravy – available at La Banquise, a local favorite that o Ưers numerous variations \nof this classic comfort food. \nToqué!  \nFor ﬁne dining, Toqué! is a standout choice, o Ưering an innovative menu that highlights \nseasonal ingredients and Quebecois ﬂavors. \nJoe Beef  \nFoodies will also appreciate Joe Beef, a beloved bistro that combines French culinary \ntechniques with hearty, local ingredients. \nFairmount Bagel and St-Viateur Bagel  \nIf you’re in the mood for something sweet, a stop at Fairmount Bagel or St-Viateur Bagel is \nessential. These bagel shops have been serving freshl

### Creating an LCEL Chain and Testing the Retriever

In [31]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Without this step the prompt receives the string form of the whole
    Document list (metadata included) instead of just the page text.

    Args:
        docs: Documents returned by the retriever.

    Returns:
        The page_content of every document, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Prompt template
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4o")

# RAG pipeline: the question goes to the retriever, whose documents are
# flattened to plain text for {context}; the question itself is passed
# through unchanged for {question}.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [32]:
chain.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

'For those interested in Middle Eastern cuisine, Damas is a high-end Syrian restaurant in Montreal known for its exquisite Middle Eastern cuisine.'

In [33]:
chain.invoke("Where can I find the best smoked meat sandwiches in Montreal?")

'You can find the best smoked meat sandwiches in Montreal at Schwartz’s Deli.'

In [34]:
chain.invoke("Where can I find the best food in Montreal?")

"Montreal offers a diverse array of dining options that cater to various tastes. Some standout places include:\n\n- **Schwartz’s Deli** for mouth-watering smoked meat sandwiches.\n- **La Banquise** for a true taste of Montreal's poutine.\n- **Toqué!** for fine dining with an innovative menu.\n- **Joe Beef** for a combination of French culinary techniques and local ingredients.\n- **Fairmount Bagel and St-Viateur Bagel** for freshly baked, wood-fired bagels.\n- **Candide** for contemporary dishes with a focus on local and sustainable ingredients.\n- **Damas** for exquisite Middle Eastern cuisine.\n\nThese establishments are highly recommended for experiencing the best food in Montreal."