In [1]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
# The value displayed by this cell is load_dotenv()'s return value.
# NOTE(review): presumably this supplies the OpenAI key / database settings
# used by later cells — confirm against the project's .env template.
load_dotenv()

True

### Understanding Store in LangChain

In [38]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the path with os.path.join so the notebook also runs on non-Windows
# machines; on Windows this yields the same "data\toronto.pdf" string as
# before, so the 'source' metadata (and anything derived from it) is unchanged.
# file_path = os.path.join("data", "montreal.pdf")
file_path = os.path.join("data", "toronto.pdf")

loader = PyPDFLoader(file_path=file_path)

# PyPDFLoader produces one Document per PDF page.
# NOTE: text_splitter=None does not mean "no splitter" — load_and_split then
# falls back to a default RecursiveCharacterTextSplitter, so only pages longer
# than that splitter's default chunk size would be split further.
documents = loader.load_and_split(text_splitter=None)
documents

[Document(page_content='Things to Do in Toronto \nPage 1: Introduction \nToronto, the capital of Ontario, is the largest city in Canada and a dynamic, cosmopolitan \nhub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor. \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t \nmiss the glass ﬂoor and the revolving restaurant at the top. \n\uf0b7 Royal Ontario Museum (ROM):  Canada’s largest museum of world cultures and \nnatural history is a must-visit. \n\uf0b7 Toronto Islands:  A group of small islands located just o Ư the city’s shore, o Ưering \nbeautiful beaches, picnic spots, and bike rentals.', metadata={'source': 'data\\toronto.pdf', 'page': 0}),
 Document(page_content='Page 2: Cultural Experiences \nToronto is a melting pot of cultures, and this is reﬂected in its neighborhoods and festivals. \nNeighborhoods:  \n\uf0b7 China

### Using the PostgresByteStore

In [3]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from utils.store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer
from utils.custom_sql_record_manager import CustomSQLRecordManager
from utils.index_with_ids import index_with_ids

# Embedding model plus the PGVector collection that stores the embedded
# (child / summary / question) documents.
embeddings = OpenAIEmbeddings()
vectorstore = PGVector(
    connection=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings,
    use_jsonb=True,
)

# Metadata key that links every embedded document back to its parent,
# and the Postgres-backed store that holds the parent documents themselves.
id_key = "doc_id"
store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)

retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

# Record manager used by the indexing cells, namespaced per collection.
namespace = "pgvector/{}".format(COLLECTION_NAME)
record_manager = CustomSQLRecordManager(namespace, db_url=CONNECTION_STRING)
record_manager.create_schema()

# Display the configured retriever.
retriever

MultiVectorRetriever(vectorstore=<langchain_postgres.vectorstores.PGVector object at 0x00000266345C4690>, docstore=<utils.store.PostgresByteStore object at 0x00000266346351D0>)

In [39]:
from utils.utils import generate_reproducible_id_by_content

# Add a reproducible unique doc_id to each document's metadata.
# The id is computed from the page content and the metadata *without* any
# existing "doc_id" entry, so re-running this cell produces the same ids
# instead of feeding the previous id back into the generator.
# (Assumes the helper derives the id from the metadata it is given — TODO confirm.)
for doc in documents:
    id_metadata = {key: value for key, value in doc.metadata.items() if key != "doc_id"}
    doc.metadata["doc_id"] = generate_reproducible_id_by_content(doc.page_content, id_metadata)

In [40]:
# Pair every parent document with the doc_id stored in its metadata.
doc_id_document_tuples = []
for parent_doc in documents:
    doc_id_document_tuples.append((parent_doc.metadata["doc_id"], parent_doc))

# Hand the (doc_id, document) pairs to the docstore's conditional_mset and keep
# what it returns: one (doc_id, operation) pair per parent, which the later
# cells branch on (e.g. 'SKIP').
parent_docs_operations = retriever.docstore.conditional_mset(doc_id_document_tuples)

In [41]:
# Show the (doc_id, operation) pairs returned by conditional_mset.
parent_docs_operations

[('eac9bbc7-a391-5931-a26c-11d9ee2402aa', 'SKIP'),
 ('4d722603-1c85-56ab-82f2-2d4dfdd3eb68', 'SKIP'),
 ('80e1944b-044e-5dc8-899f-7a941f1fa08b', 'SKIP'),
 ('a9cace25-2615-59b1-9669-ccd6656ac767', 'SKIP')]

In [42]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, delete, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.sql import cast
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the "smaller chunk" child documents for every parent page.
# Assumes parent_docs_operations holds (doc_id, operation) pairs and that each
# parent Document carries its 'doc_id' in metadata.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# List to store all sub-documents
all_sub_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Lightweight description of the embedding table, used only to build queries.
# NOTE(review): the declared column types are approximations (e.g. 'embedding'
# as LargeBinary) — confirm against the real schema before selecting them.
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Only the text and metadata are needed to rebuild a Document, so the id,
# collection_id and embedding columns are not fetched.
stored_chunk_select = select(
    langchain_pg_embedding.c.document,
    langchain_pg_embedding.c.cmetadata,
)

try:
    # The context manager closes the session even if the loop raises.
    with Session() as session:
        for doc_id, operation in parent_docs_operations:
            if operation == 'SKIP':
                # Parent already stored: reload its previously indexed chunks.
                # NOTE(review): the filter does not restrict by collection, and a
                # parent with no indexed chunks contributes nothing — confirm
                # both are acceptable.
                query = stored_chunk_select.where(
                    (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                    (langchain_pg_embedding.c.cmetadata['type'].astext == 'smaller chunk')
                )
                for row in session.execute(query).fetchall():
                    all_sub_docs.append(
                        Document(page_content=row.document, metadata=row.cmetadata)
                    )
            else:
                # New or changed parent: split it into fresh child chunks.
                doc = retriever.docstore.get(doc_id)
                if doc:
                    source = doc.metadata.get("source")  # Parent's source, reused for the chunks
                    sub_docs = child_text_splitter.split_documents([doc])
                    for sub_doc in sub_docs:
                        sub_doc.metadata["doc_id"] = doc_id  # Link the chunk to its parent
                        sub_doc.metadata["source"] = f"{source}(smaller chunk)"  # Add the suffix to the source
                        sub_doc.metadata["type"] = "smaller chunk"
                    all_sub_docs.extend(sub_docs)
finally:
    # Release the pooled connections owned by this cell's engine.
    engine.dispose()

# The resulting sub-documents
all_sub_docs


[Document(page_content='Things to Do in Toronto \nPage 1: Introduction \nToronto, the capital of Ontario, is the largest city in Canada and a dynamic, cosmopolitan \nhub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor. \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t', metadata={'page': 0, 'type': 'smaller chunk', 'doc_id': 'eac9bbc7-a391-5931-a26c-11d9ee2402aa', 'source': 'data\\toronto.pdf(smaller chunk)'}),
 Document(page_content='Toronto o Ưers a wealth of experiences for every visitor. \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t \nmiss the glass ﬂoor and the revolving restaurant at the top. \n\uf0b7 Royal Ontario Museum (ROM):  Canada’s largest museum of world cultures and \nnatural history is a must-visit.', metadata={'page': 0, 'type': 'smaller chunk',

In [43]:
# Index the child chunks through the record manager into the vector store,
# using incremental cleanup keyed on each document's "source" metadata field.
idx = index_with_ids(
    all_sub_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)


In [44]:
# Show the indexing result for the child chunks (status, per-key operations, counts).
idx

{'status': 'success',
 'ids': [{'key': 'de64f070-8219-5b39-8032-cc0f8b38129c', 'operation': 'SKIP'},
  {'key': '7e3089a7-17c8-5f81-85bf-85d2ffbd49e5', 'operation': 'SKIP'},
  {'key': 'f5d04788-e830-5f9e-a59b-6f868320a1aa', 'operation': 'SKIP'},
  {'key': '503d7916-e17c-560b-8f6d-fb43883ba7bf', 'operation': 'SKIP'},
  {'key': 'be89892a-f871-5170-9e26-ea9b31662aab', 'operation': 'SKIP'},
  {'key': '1bdda57a-675b-5bb3-8a3c-0484976f70f1', 'operation': 'SKIP'},
  {'key': '5a393ada-9b12-5354-b38f-295b353ce558', 'operation': 'SKIP'},
  {'key': '3862ca2f-e184-5d94-8ed9-1747dbecdfc0', 'operation': 'SKIP'},
  {'key': '55c17723-722d-53da-ad32-56d0c6e00111', 'operation': 'SKIP'},
  {'key': '664c65ff-79b2-551c-93e5-fba7ece620cb', 'operation': 'SKIP'}],
 'results': [{'num_added': 0,
   'num_updated': 0,
   'num_skipped': 10,
   'num_deleted': 0}]}

### Creating Summaries for Each Parent Chunk

In [45]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Summarisation prompt; {element} is replaced by the text of one parent chunk.
prompt_text = (
    "You are an assistant tasked with summarizing text. "
    "Directly summarize the following text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Chat model used for summarisation (temperature 0).
model = ChatOpenAI(model="gpt-4o", temperature=0)

# Chain: raw text -> {"element": text} -> prompt -> model -> plain string.
summarize_chain = (
    {"element": lambda text: text}
    | prompt
    | model
    | StrOutputParser()
)

In [46]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from langchain.schema.document import Document

# Build one "summary" document per parent page: reuse stored summaries for
# parents reported as 'SKIP', generate new ones with summarize_chain otherwise.
summary_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Lightweight description of the embedding table, used only to build queries.
# NOTE(review): the declared column types are approximations (e.g. 'embedding'
# as LargeBinary) — confirm against the real schema before selecting them.
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Collect parent chunks and associated document IDs for documents that are not SKIP
non_skip_docs = [(doc, doc_id) for doc, (doc_id, operation) in zip(documents, parent_docs_operations) if operation != 'SKIP']
parent_chunk = [doc.page_content for doc, _ in non_skip_docs]

# Generate summaries for the parent chunks that are not SKIP
text_summaries = summarize_chain.batch(parent_chunk, {"max_concurrency": 5})

# Summaries come back in the same order as parent_chunk, so they are consumed
# one at a time while walking the parents in that same order below.
text_summaries_iter = iter(text_summaries)

# Only the text and metadata are needed to rebuild a Document, so the id,
# collection_id and embedding columns are not fetched.
stored_summary_select = select(
    langchain_pg_embedding.c.document,
    langchain_pg_embedding.c.cmetadata,
)

try:
    # The context manager closes the session even if the loop raises.
    with Session() as session:
        for doc, (doc_id, operation) in zip(documents, parent_docs_operations):
            if operation == 'SKIP':
                # Parent already stored: reload its previously indexed summary.
                # NOTE(review): the filter does not restrict by collection, and a
                # parent with no indexed summary contributes nothing — confirm
                # both are acceptable.
                query = stored_summary_select.where(
                    (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                    (langchain_pg_embedding.c.cmetadata['type'].astext == 'summary')
                )
                for row in session.execute(query).fetchall():
                    summary_docs.append(
                        Document(page_content=row.document, metadata=row.cmetadata)
                    )
            else:
                # Retrieve the source and page from the document's metadata
                source = doc.metadata.get("source")
                page = doc.metadata.get("page")

                # Get the next generated summary
                summary_content = next(text_summaries_iter)

                # Create a summary document linked to its parent via doc_id
                summary_doc = Document(page_content=summary_content, metadata={
                    "doc_id": doc_id,
                    "source": f"{source}(summary)",
                    "page": page,
                    "type": "summary"
                })
                summary_docs.append(summary_doc)
finally:
    # Release the pooled connections owned by this cell's engine.
    engine.dispose()

# The resulting summary documents
summary_docs


[Document(page_content="Toronto, the capital of Ontario and Canada's largest city, is a vibrant and cosmopolitan hub known for its skyline, waterfront, and cultural attractions. Key attractions include the CN Tower with its panoramic views and revolving restaurant, the Royal Ontario Museum (ROM) which is Canada's largest museum of world cultures and natural history, and the Toronto Islands offering beaches, picnic spots, and bike rentals.", metadata={'page': 0, 'type': 'summary', 'doc_id': 'eac9bbc7-a391-5931-a26c-11d9ee2402aa', 'source': 'data\\toronto.pdf(summary)'}),
 Document(page_content="Toronto's cultural experiences are highlighted through its diverse neighborhoods and festivals. Key neighborhoods include Chinatown, known for its vibrant food scene; Kensington Market, offering vintage shops and international food stalls; and the Distillery District, featuring Victorian Industrial architecture with boutiques and art galleries. Notable festivals include Caribana, celebrating Cari

In [47]:
# Index the summary documents through the record manager into the vector store,
# using incremental cleanup keyed on each document's "source" metadata field.
idx = index_with_ids(
    summary_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)


In [48]:
# Show the indexing result for the summary documents (status, per-key operations, counts).
idx

{'status': 'success',
 'ids': [{'key': '951ade68-c534-535c-bba6-fe2ed366eefd', 'operation': 'SKIP'},
  {'key': 'fb7fc7df-1de3-5af3-9483-f5fdddfaf66e', 'operation': 'SKIP'},
  {'key': 'a9199eb6-da61-5344-a93d-e02d82f6ed34', 'operation': 'SKIP'},
  {'key': '1cd7a39b-13f5-59c4-a583-bf9c48c6c8f2', 'operation': 'SKIP'}],
 'results': [{'num_added': 0,
   'num_updated': 0,
   'num_skipped': 4,
   'num_deleted': 0}]}

### Generating Hypothetical Questions for Each Parent Chunk

In [49]:
# JSON Schema for the function arguments: an object with one required
# "questions" property holding an array of strings.
_questions_schema = {
    "type": "object",
    "properties": {
        "questions": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["questions"],
}

# Function definitions bound to the chat model in the question chain.
functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=_questions_schema,
    ),
]

In [50]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Prompt asking for hypothetical questions about one document.
# The text (including its original spelling and indentation) is reproduced
# verbatim so the request sent to the model is unchanged.
# NOTE(review): the "comma" instruction overlaps with the function schema, which
# already returns the questions as an array — consider revisiting the wording.
question_prompt = ChatPromptTemplate.from_template(
    "Generate a list of exactly 5 hypothetical questions that the below document could be used to answer:\n\n{doc}\n"
    "        seperate each question with a comma (,)\n"
    "        "
)

# Chat model forced to answer through the "hypothetical_questions" function.
question_llm = ChatOpenAI(max_retries=0, model="gpt-4o").bind(
    functions=functions, function_call={"name": "hypothetical_questions"}
)

# Chain: Document -> {"doc": page text} -> prompt -> model -> list under "questions".
question_chain = (
    {"doc": lambda x: x.page_content}
    | question_prompt
    | question_llm
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [51]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from langchain.schema.document import Document

# Build the "question" documents for every parent page: reuse stored questions
# for parents reported as 'SKIP', generate new ones with question_chain otherwise.
question_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Lightweight description of the embedding table, used only to build queries.
# NOTE(review): the declared column types are approximations (e.g. 'embedding'
# as LargeBinary) — confirm against the real schema before selecting them.
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Collect parent documents and associated document IDs for documents that are not SKIP
non_skip_docs = [(doc, doc_id) for doc, (doc_id, operation) in zip(documents, parent_docs_operations) if operation != 'SKIP']
parent_documents = [doc for doc, _ in non_skip_docs]

# Generate hypothetical questions for the parent documents that are not SKIP
hypothetical_questions = question_chain.batch(parent_documents, {"max_concurrency": 5})

# Question lists come back in the same order as parent_documents, so they are
# consumed one at a time while walking the parents in that same order below.
hypothetical_questions_iter = iter(hypothetical_questions)

# Only the text and metadata are needed to rebuild a Document, so the id,
# collection_id and embedding columns are not fetched.
stored_question_select = select(
    langchain_pg_embedding.c.document,
    langchain_pg_embedding.c.cmetadata,
)

try:
    # The context manager closes the session even if the loop raises.
    with Session() as session:
        for doc, (doc_id, operation) in zip(documents, parent_docs_operations):
            if operation == 'SKIP':
                # Parent already stored: reload its previously indexed questions.
                # NOTE(review): the filter does not restrict by collection, and a
                # parent with no indexed questions contributes nothing — confirm
                # both are acceptable.
                query = stored_question_select.where(
                    (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                    (langchain_pg_embedding.c.cmetadata['type'].astext == 'question')
                )
                for row in session.execute(query).fetchall():
                    question_docs.append(
                        Document(page_content=row.document, metadata=row.cmetadata)
                    )
            else:
                # Retrieve the source and page from the document's metadata
                source = doc.metadata.get("source")
                page = doc.metadata.get("page")

                # Get the list of generated questions for this document
                question_list = next(hypothetical_questions_iter)

                # Create a question document for each question in the list
                for question_content in question_list:
                    # Each question gets its own metadata dict, linked to the parent via doc_id
                    new_metadata = {
                        "doc_id": doc_id,
                        "source": f"{source}(question)",
                        "page": page,
                        "type": "question"
                    }

                    # Create the question document
                    question_doc = Document(page_content=question_content, metadata=new_metadata)
                    question_docs.append(question_doc)
finally:
    # Release the pooled connections owned by this cell's engine.
    engine.dispose()

# The resulting question documents
question_docs


[Document(page_content='What are some of the top attractions to visit in Toronto?', metadata={'page': 0, 'type': 'question', 'doc_id': 'eac9bbc7-a391-5931-a26c-11d9ee2402aa', 'source': 'data\\toronto.pdf(question)'}),
 Document(page_content='Is the CN Tower worth visiting for a panoramic view of Toronto?', metadata={'page': 0, 'type': 'question', 'doc_id': 'eac9bbc7-a391-5931-a26c-11d9ee2402aa', 'source': 'data\\toronto.pdf(question)'}),
 Document(page_content='What can visitors do on the Toronto Islands?', metadata={'page': 0, 'type': 'question', 'doc_id': 'eac9bbc7-a391-5931-a26c-11d9ee2402aa', 'source': 'data\\toronto.pdf(question)'}),
 Document(page_content='Are there any notable museums in Toronto?', metadata={'page': 0, 'type': 'question', 'doc_id': 'eac9bbc7-a391-5931-a26c-11d9ee2402aa', 'source': 'data\\toronto.pdf(question)'}),
 Document(page_content='What makes Toronto a dynamic and cosmopolitan hub?', metadata={'page': 0, 'type': 'question', 'doc_id': 'eac9bbc7-a391-5931-a26

In [52]:
# Index the question documents through the record manager into the vector store,
# using incremental cleanup keyed on each document's "source" metadata field.
idx = index_with_ids(
    question_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)



In [53]:
# Show the indexing result for the question documents (status, per-key operations).
idx

{'status': 'success',
 'ids': [{'key': '97ebcb97-f2a5-5da7-b1e1-2c66d9c763ef', 'operation': 'SKIP'},
  {'key': '80a1f94f-f85b-540d-a8ad-6483007e2a65', 'operation': 'SKIP'},
  {'key': '6086e0e0-7c2a-569e-8c79-c11f6e272540', 'operation': 'SKIP'},
  {'key': '50a1af88-0d1e-566c-b100-4c8390db89a3', 'operation': 'SKIP'},
  {'key': 'f74077eb-a538-57fa-a8cc-067468ba2652', 'operation': 'SKIP'},
  {'key': '33b30531-4744-5a34-aad0-e375b0e0b47e', 'operation': 'SKIP'},
  {'key': '34d85a51-ff59-5a51-ac21-fdedc576d740', 'operation': 'SKIP'},
  {'key': '51d7af49-b3cf-5ef0-862d-b8e21d108c4e', 'operation': 'SKIP'},
  {'key': '0600d956-c4f0-51e6-99f4-174fdde4efe1', 'operation': 'SKIP'},
  {'key': 'd06b267c-9920-5637-9ae4-b214fcd040f0', 'operation': 'SKIP'},
  {'key': 'ae0d77ee-12f9-5c17-b4b8-de28d261e3eb', 'operation': 'SKIP'},
  {'key': '442bc5ab-b40f-5c1f-b95e-aab596a0d3b2', 'operation': 'SKIP'},
  {'key': '0fb59fc5-23c0-5b9c-92cb-28cdc0f52c37', 'operation': 'SKIP'},
  {'key': '71173b29-71ed-5c9e-9958-

In [None]:
# Query the vector store directly, bypassing the docstore lookup.
# NOTE(review): the question is about Montreal while this run loaded
# data\toronto.pdf — it only returns relevant hits if Montreal content is
# already present in the collection from an earlier run; confirm.
retriever.vectorstore.similarity_search("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

In [None]:
# Ask the same question through the MultiVectorRetriever instead of the raw vector store.
# NOTE(review): the question targets Montreal, but this run loaded the Toronto PDF — confirm intent.
retriever.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

### Creating an LCEL Chain and Testing the Retriever

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Answering prompt: the model may only use the retrieved context.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)
# Note: this rebinds `prompt` and `model`, which the summary cell also defines.
prompt = ChatPromptTemplate.from_template(template)

# Chat model used to answer (temperature 0).
model = ChatOpenAI(model="gpt-4o", temperature=0)

# RAG pipeline: the question goes to the retriever to fetch context and is
# also passed through unchanged as {question}.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [None]:
# Run the RAG chain end to end on a cuisine-specific question.
# NOTE(review): the question targets Montreal, but this run loaded the Toronto PDF — confirm intent.
chain.invoke("What dining options are available in Montreal for those interested in Middle Eastern cuisine?")

In [None]:
# Run the RAG chain on a narrow, dish-specific question.
# NOTE(review): the question targets Montreal, but this run loaded the Toronto PDF — confirm intent.
chain.invoke("Where can I find the best smoked meat sandwiches in Montreal?")

In [None]:
# Run the RAG chain on a broad, open-ended question.
# NOTE(review): the question targets Montreal, but this run loaded the Toronto PDF — confirm intent.
chain.invoke("Where can I find the best food in Montreal?")