In [76]:
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

True

### Understanding Stores in LangChain

In [113]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

file_path = r"data\montreal.pdf"

loader = PyPDFLoader(file_path=file_path)

# by default, we will split by pages with no text_splitter
documents = loader.load_and_split(text_splitter=None)
documents

[Document(page_content="Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal  \nIf you're an outdoor enthusiast, Mont Royal o Ưers scenic hiking trails and panoramic views \nof the city. In the summer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to enjoy music and dance. \nMile End District   \nFor a modern twist, the Mile End district is a must-visit. Known for its bohemian vib

### Unsing the PostgresByteStore

In [78]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from utils.store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer
from langchain.indexes import SQLRecordManager, index

embeddings = OpenAIEmbeddings()
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
    use_jsonb=True,
)

store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=vectorstore, 
    docstore=store, 
    id_key=id_key,
)

# define record manager
namespace = f"pgvector/{COLLECTION_NAME}"
record_manager = SQLRecordManager(
    namespace, db_url=CONNECTION_STRING
)
record_manager.create_schema()

retriever

MultiVectorRetriever(vectorstore=<langchain_postgres.vectorstores.PGVector object at 0x000002E90CA66F10>, docstore=<utils.store.PostgresByteStore object at 0x000002E90CA65D90>)

In [114]:
import uuid

# Add a unique doc_id to each document's metadata
for doc in documents:
    doc.metadata["doc_id"] = str(uuid.uuid4())

In [115]:
documents

[Document(page_content="Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal  \nIf you're an outdoor enthusiast, Mont Royal o Ưers scenic hiking trails and panoramic views \nof the city. In the summer, the park becomes a hub for picnics and outdoor events, \nincluding the popular Tam-Tams festival, where locals gather to enjoy music and dance. \nMile End District   \nFor a modern twist, the Mile End district is a must-visit. Known for its bohemian vib

In [116]:
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Assume documents is a list of Document objects with 'doc_id' in their metadata
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

all_sub_docs = []
for doc in documents:
    doc_id = doc.metadata.get("doc_id")  # Retrieve the doc_id from the document's metadata
    sub_docs = child_text_splitter.split_documents([doc])
    for sub_doc in sub_docs:
        sub_doc.metadata["doc_id"] = doc_id  # Assign the same doc_id to each sub-document
    all_sub_docs.extend(sub_docs)

# Display the resulting sub-documents
all_sub_docs


[Document(page_content='Things to Do in Montreal \nMontreal is a vibrant city with a rich cultural heritage and an array of activities to suit every \ninterest. \nOld Montreal  \nDebute  your exploration in the historic Old Montreal, where cobblestone streets and 17th-\ncentury architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows.', metadata={'source': 'data\\montreal.pdf', 'page': 0, 'doc_id': '20ce5d80-60c9-4173-b157-39db3ffa6ea4'}),
 Document(page_content='century architecture transport you back in time. Visit the stunning Notre-Dame Basilica, \nrenowned for its intricate interior and dramatic light shows. \nMontreal Museum of Fine Arts  \nFor a taste of the local arts scene, head to the Montreal Museum of Fine Arts, home to an \nimpressive collection of Canadian and international works. \nMont Royal', metadata={'source': 'data\\montreal.pdf', 'page': 0, 'doc_id': '20ce5d80-60c9-4173-b157-39db

In [117]:
idx = index(all_sub_docs, record_manager, vectorstore, cleanup="incremental",
                                          source_id_key="source")
# Generate the list of (doc_id, document) tuples from the documents
doc_id_document_tuples = [(doc.metadata["doc_id"], doc) for doc in documents]

# Pass the list of tuples to retriever.docstore.mset
retriever.docstore.mset(doc_id_document_tuples)

In [118]:
idx

{'num_added': 13, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 13}

### Creating Summaries for Each Parent Chunk

In [83]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

prompt_text = """You are an assistant tasked with summarizing text. \
Directly summarize the following text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Initialize the Language Model (LLM)
model = ChatOpenAI(temperature=0, model="gpt-4o")

# Define the summary chain
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [84]:
parent_chunk = [i.page_content for i in documents]
text_summaries = summarize_chain.batch(parent_chunk, {"max_concurrency": 5})

In [85]:
from langchain.schema.document import Document

# Extract metadata from documents
metadata = [i.metadata for i in documents]

# Create new Document objects with the summaries and the original metadata
text_summaries_with_metadata = []
for summary, meta in zip(text_summaries, metadata):
    # Add the suffix "(summary)" to the source in metadata
    if 'source' in meta:
        meta['source'] += ' (summary)'
    text_summaries_with_metadata.append(Document(page_content=summary, metadata=meta))


In [94]:
text_summaries_with_metadata

[Document(page_content='Montréal offers a variety of activities for all interests, reflecting its rich cultural heritage. Start in Old Montreal with its historic architecture and the Notre-Dame Basilica. Art enthusiasts should visit the Montreal Museum of Fine Arts. Outdoor lovers can explore Mont Royal for hiking and city views, and enjoy summer events like the Tam-Tams festival. The Mile End district offers a bohemian atmosphere with indie shops, galleries, and street art. In winter, the Underground City provides 30 kilometers of sheltered pathways connecting shops and metro stations. The city is also renowned for festivals like the Montreal International Jazz Festival and Just for Laughs.', metadata={'source': 'data\\montreal.pdf (summary)', 'page': 0, 'doc_id': '7b6bd10e-acaf-4aa4-ba7f-8b1bb14763d3'}),
 Document(page_content="Montreal's culinary scene is diverse and exciting, offering a range of dining experiences. Start with Schwartz’s Deli for iconic smoked meat sandwiches, and t

In [95]:
idx = index(text_summaries_with_metadata, record_manager, vectorstore, cleanup="incremental",
                                          source_id_key="source")
# Generate the list of (doc_id, document) tuples from the documents
doc_id_document_tuples = [(doc.metadata["doc_id"], doc) for doc in documents]

# Pass the list of tuples to retriever.docstore.mset
retriever.docstore.mset(doc_id_document_tuples)

In [96]:
idx

{'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}

### Generating Hypothetical Questions for Each Parent Chunk

In [97]:
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
            "required": ["questions"],
        },
    }
]

In [98]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

question_chain = (
    {"doc": lambda x: x.page_content}
    # Only asking for 5 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        """Generate a list of exactly 5 hypothetical questions that the below document could be used to answer:\n\n{doc}
        seperate each question with a comma (,)
        """
    )
    | ChatOpenAI(max_retries=0, model="gpt-4o").bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [99]:
hypothetical_questions = question_chain.batch(documents, {"max_concurrency": 5})

In [100]:
hypothetical_questions

[['What historical sites can I visit in Old Montreal?',
  'What art-related attractions are available in Montreal?',
  'Where can I go for outdoor activities in Montreal?',
  'What can I do in the Mile End district?',
  'How can I navigate Montreal during the winter?'],
 ['What is a must-try iconic dish in Montreal and where can I find it?',
  'Where can I experience fine dining in Montreal?',
  'Which restaurants in Montreal are known for combining local ingredients with international culinary techniques?',
  'Where can I find the best poutine in Montreal?',
  'Which establishments in Montreal are famous for their bagels?']]

In [103]:
from langchain.schema.document import Document

hypothetical_docs = []
for original_doc, question_list in zip(documents, hypothetical_questions):
    doc_id = original_doc.metadata["doc_id"]
    page = original_doc.metadata.get("page")
    source = original_doc.metadata.get("source")

    for question in question_list:
        # Define your new metadata here with modified source
        new_metadata = {
            "doc_id": doc_id,
            "page": page,
            "source": f"{source}(question)"  # Add the suffix to the source
        }

        # Create a new Document instance for each question
        # The question itself is the page_content
        doc = Document(page_content=question, metadata=new_metadata)

        # Add the Document to the list
        hypothetical_docs.append(doc)

In [104]:
hypothetical_docs

[Document(page_content='What historical sites can I visit in Old Montreal?', metadata={'doc_id': '7b6bd10e-acaf-4aa4-ba7f-8b1bb14763d3', 'page': 0, 'source': 'data\\montreal.pdf (summary)(question)'}),
 Document(page_content='What art-related attractions are available in Montreal?', metadata={'doc_id': '7b6bd10e-acaf-4aa4-ba7f-8b1bb14763d3', 'page': 0, 'source': 'data\\montreal.pdf (summary)(question)'}),
 Document(page_content='Where can I go for outdoor activities in Montreal?', metadata={'doc_id': '7b6bd10e-acaf-4aa4-ba7f-8b1bb14763d3', 'page': 0, 'source': 'data\\montreal.pdf (summary)(question)'}),
 Document(page_content='What can I do in the Mile End district?', metadata={'doc_id': '7b6bd10e-acaf-4aa4-ba7f-8b1bb14763d3', 'page': 0, 'source': 'data\\montreal.pdf (summary)(question)'}),
 Document(page_content='How can I navigate Montreal during the winter?', metadata={'doc_id': '7b6bd10e-acaf-4aa4-ba7f-8b1bb14763d3', 'page': 0, 'source': 'data\\montreal.pdf (summary)(question)'}),


In [105]:
idx = index(hypothetical_docs, record_manager, vectorstore, cleanup="incremental",
                                          source_id_key="source")

# Generate the list of (doc_id, document) tuples from the documents
doc_id_document_tuples = [(doc.metadata["doc_id"], doc) for doc in documents]

# Pass the list of tuples to retriever.docstore.mset
retriever.docstore.mset(doc_id_document_tuples)

In [106]:
idx

{'num_added': 10, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [107]:
retriever.vectorstore.similarity_search("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

[Document(page_content='Where can I experience fine dining in Montreal?', metadata={'page': 1, 'doc_id': '402364d6-11f4-440e-9156-97643036c7d5', 'source': 'data\\montreal.pdf (summary)(question)'}),
 Document(page_content='Which restaurants in Montreal are known for combining local ingredients with international culinary techniques?', metadata={'page': 1, 'doc_id': '402364d6-11f4-440e-9156-97643036c7d5', 'source': 'data\\montreal.pdf (summary)(question)'}),
 Document(page_content='decades and are a staple of Montreal’s food culture. \nCandide and Damas  \nIn the mood for something more exotic? Try Candide, which serves contemporary dishes \nwith a focus on local and sustainable ingredients, or Damas, a high-end Syrian restaurant \nknown for its exquisite Middle Eastern cuisine. \nWith its diverse array of dining options, Montreal truly o Ưers something for every palate.', metadata={'page': 1, 'doc_id': '402364d6-11f4-440e-9156-97643036c7d5', 'source': 'data\\montreal.pdf'}),
 Document(

In [108]:
retriever.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

[Document(page_content='Dining in Montreal \nMontreal’s culinary scene is just as diverse and exciting as its activities. \nSchwartz’s Deli  \nStart your gastronomic journey with a visit to Schwartz’s Deli, an iconic establishment \nknown for its mouth-watering smoked meat sandwiches. \nLa Banquise  \nFor a true taste of Montreal, try poutine – a delectable dish of fries topped with cheese \ncurds and gravy – available at La Banquise, a local favorite that o Ưers numerous variations \nof this classic comfort food. \nToqué!  \nFor ﬁne dining, Toqué! is a standout choice, o Ưering an innovative menu that highlights \nseasonal ingredients and Quebecois ﬂavors. \nJoe Beef  \nFoodies will also appreciate Joe Beef, a beloved bistro that combines French culinary \ntechniques with hearty, local ingredients. \nFairmount Bagel and St-Viateur Bagel  \nIf you’re in the mood for something sweet, a stop at Fairmount Bagel or St-Viateur Bagel is \nessential. These bagel shops have been serving freshl

### Creating an LCEL Chain and Testing the Retriever

In [109]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt template
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4o")

# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [110]:
chain.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

'For those interested in Middle Eastern cuisine, Damas is a high-end Syrian restaurant in Montreal known for its exquisite Middle Eastern cuisine.'

In [111]:
chain.invoke("Where can I find the best smoked meat sandwiches in Montreal?")

'You can find the best smoked meat sandwiches in Montreal at Schwartz’s Deli.'

In [112]:
chain.invoke("Where can I find the best food in Montreal?")

'Montreal offers a diverse array of dining options that cater to various tastes. Some standout places include Schwartz’s Deli for smoked meat sandwiches, La Banquise for poutine, Toqué! for fine dining with seasonal ingredients, Joe Beef for a mix of French techniques and local ingredients, Fairmount Bagel and St-Viateur Bagel for freshly baked bagels, Candide for contemporary dishes with local and sustainable ingredients, and Damas for high-end Syrian cuisine.'