In [1]:
from dotenv import load_dotenv
import os

# Load environment variables from a .env file into the process environment
# (python-dotenv). The `True` shown as cell output is load_dotenv()'s return value.
load_dotenv()

True

### Understanding Store in LangChain

In [13]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the path with os.path.join instead of a hard-coded backslash so the
# notebook also runs on non-Windows systems (on POSIX "\" is not a separator,
# so r"data\toronto.pdf" would name a single file that does not exist).
# On Windows the resulting string is unchanged: data\toronto.pdf
file_path = os.path.join("data", "toronto.pdf")

loader = PyPDFLoader(file_path=file_path)

# PyPDFLoader produces one Document per PDF page. Passing text_splitter=None
# does NOT disable splitting: load_and_split then falls back to a default
# RecursiveCharacterTextSplitter, so a page longer than that splitter's default
# chunk size would be split further. For short pages the result is one
# Document per page.
documents = loader.load_and_split(text_splitter=None)
documents

[Document(page_content='Things to Do in Toronto \nPage 1: Introduction \nToronto, the capital of Ontario, is the largest city in Canada and a dynamic, cosmopolitan \nhub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor.  \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t \nmiss the glass ﬂoor and the revolving restaurant at the top. \n\uf0b7 Royal Ontario Museum (ROM):  Canada’s largest museum of world cultures and \nnatural history is a must-visit. \n\uf0b7 Toronto Islands:  A group of small islands located just o Ư the city’s shore, o Ưering \nbeautiful beaches, picnic spots, and bike rentals.', metadata={'source': 'data\\toronto.pdf', 'page': 0}),
 Document(page_content='Page 2: Cultural Experiences \nToronto is a melting pot of cultures, and this is reﬂected in its neighborhoods and festivals. \nNeighborhoods:  \n\uf0b7 Chin

### Using the PostgresByteStore

In [3]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from utils.store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer
from utils.custom_sql_record_manager import CustomSQLRecordManager
from utils.index_with_ids import index_with_ids


# Embedding model handed to the PGVector store below.
embeddings = OpenAIEmbeddings()

# Vector store backed by Postgres/pgvector; metadata is stored as JSONB.
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
    use_jsonb=True,
)

# Metadata key that links every indexed child/summary/question document
# back to its parent document in the docstore.
id_key = "doc_id"

# Docstore for the parent documents (project-local Postgres-backed store).
store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)

# Retriever that searches the vector store and resolves hits through
# `id_key` to the documents held in the docstore.
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)

# Record manager used by the indexing calls further down; its namespace is
# derived from the collection name.
namespace = f"pgvector/{COLLECTION_NAME}"
record_manager = CustomSQLRecordManager(namespace, db_url=CONNECTION_STRING)
record_manager.create_schema()

retriever

MultiVectorRetriever(vectorstore=<langchain_postgres.vectorstores.PGVector object at 0x0000025FEF02AA90>, docstore=<utils.store.PostgresByteStore object at 0x0000025FEF0F7350>)

In [14]:
from utils.utils import generate_reproducible_id_by_content

# Add a reproducible unique doc_id to each document's metadata.
#
# The ID is derived from the page content and the metadata *without* any
# previously assigned "doc_id". On the first run this is identical to passing
# doc.metadata as-is; on a re-run of this cell it keeps the generated ID stable
# in case the helper takes the metadata into account (otherwise the earlier
# doc_id would feed into the new one and the ID would change on every re-run).
for doc in documents:
    metadata_without_id = {key: value for key, value in doc.metadata.items() if key != "doc_id"}
    doc.metadata["doc_id"] = generate_reproducible_id_by_content(doc.page_content, metadata_without_id)

In [15]:
# Display the documents again to confirm each one now carries a "doc_id" in its metadata
documents

[Document(page_content='Things to Do in Toronto \nPage 1: Introduction \nToronto, the capital of Ontario, is the largest city in Canada and a dynamic, cosmopolitan \nhub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor.  \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t \nmiss the glass ﬂoor and the revolving restaurant at the top. \n\uf0b7 Royal Ontario Museum (ROM):  Canada’s largest museum of world cultures and \nnatural history is a must-visit. \n\uf0b7 Toronto Islands:  A group of small islands located just o Ư the city’s shore, o Ưering \nbeautiful beaches, picnic spots, and bike rentals.', metadata={'source': 'data\\toronto.pdf', 'page': 0, 'doc_id': '70a01880-8f42-5b40-9fb0-85852ecc0f1d'}),
 Document(page_content='Page 2: Cultural Experiences \nToronto is a melting pot of cultures, and this is reﬂected in its neighborho

In [16]:
# Build one (doc_id, document, source) triple per parent document
doc_id_document_tuples = [
    (parent.metadata["doc_id"], parent, parent.metadata["source"])
    for parent in documents
]

# Hand the triples to the docstore's conditional_mset and keep what it returns
# (a list of (doc_id, operation) pairs, as shown in the next cell)
parent_docs_operations = retriever.docstore.conditional_mset(doc_id_document_tuples)

In [17]:
# Show the (doc_id, operation) pairs returned by conditional_mset;
# the operations visible in the output below are 'DEL', 'INS' and 'SKIP'
parent_docs_operations

[('756d43c5-71a0-5ce9-8c77-4f23e8c7721f', 'DEL'),
 ('70a01880-8f42-5b40-9fb0-85852ecc0f1d', 'INS'),
 ('4d722603-1c85-56ab-82f2-2d4dfdd3eb68', 'SKIP'),
 ('80e1944b-044e-5dc8-899f-7a941f1fa08b', 'SKIP'),
 ('a9cace25-2615-59b1-9669-ccd6656ac767', 'SKIP')]

### Creating Smaller Documents

In [None]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators tried by the recursive splitter: paragraph breaks, line breaks,
# then sentence-ending punctuation.
separators = ["\n\n", "\n", ".", "?", "!"]

# Splitter that cuts each parent document into small child chunks
# (fixed parameters so repeated runs produce the same chunks).
child_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    separators=separators
)

# Collects every child chunk, both re-read from the database and newly split
all_sub_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Lightweight description of the embedding table, used only to build SELECTs
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Sort the parent documents operations to ensure deterministic processing order
parent_docs_operations = sorted(parent_docs_operations, key=lambda x: x[0])

# The context manager closes the session even if a query or the splitter raises
with Session() as session:
    for doc_id, operation in parent_docs_operations:
        if operation == 'SKIP':
            # Unchanged parent: rebuild its child chunks from the rows already
            # stored in langchain_pg_embedding. Only the text and metadata are
            # needed, so the (large) embedding column is not fetched.
            query = (
                select(
                    langchain_pg_embedding.c.document,
                    langchain_pg_embedding.c.cmetadata,
                )
                .where(
                    (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                    (langchain_pg_embedding.c.cmetadata['type'].astext == 'smaller chunk')
                )
                .order_by(langchain_pg_embedding.c.id)  # Ensure fixed order
            )

            for row in session.execute(query):
                all_sub_docs.append(
                    Document(page_content=row.document, metadata=row.cmetadata)
                )
            continue

        # Any other operation: fetch the parent from the docstore and split it.
        # A falsy result (no document returned for this id) is skipped.
        doc = retriever.docstore.get(doc_id)
        if not doc:
            continue

        source = doc.metadata.get("source")  # Source recorded on the parent document
        sub_docs = child_text_splitter.split_documents([doc])
        # Ensure fixed order for sub-documents
        sub_docs = sorted(sub_docs, key=lambda x: x.page_content)
        for sub_doc in sub_docs:
            sub_doc.metadata["doc_id"] = doc_id  # Link the chunk to its parent
            sub_doc.metadata["source"] = f"{source}(smaller chunk)"  # Add the suffix to the source
            sub_doc.metadata["type"] = "smaller chunk"
        all_sub_docs.extend(sub_docs)

# The resulting sub-documents
all_sub_docs

In [None]:
# Number of child chunks collected (re-read from the database plus newly split)
len(all_sub_docs)

In [None]:
# Index the child chunks into the vector store through the record manager,
# with incremental cleanup keyed on the "source" metadata field
idx = index_with_ids(
    all_sub_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)


In [None]:
# Show what index_with_ids returned for the child chunks
idx

### Creating Summaries for Each Parent Chunk

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt asking the model to rephrase a chunk in its original language;
# the chunk text is injected as the `element` variable
prompt_text = """You are an assistant tasked with summarizing and rephrasing text to maintain the original intent of the document. Rephrase the following text chunk in its original language, ensuring to preserve the original meaning and context, even if it comes from a chart or table: \n\n{element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Chat model that writes the summaries (temperature=0)
model = ChatOpenAI(model="gpt-4o", temperature=0)

# Summary chain: the chain input is forwarded unchanged as `element`,
# rendered into the prompt, sent to the model and parsed to a plain string
summarize_chain = (
    {"element": lambda x: x}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from langchain.schema.document import Document

# Final list of summary documents, one per parent document (in `documents` order)
summary_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Lightweight description of the embedding table, used only to build SELECTs
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Map doc_id -> parent document
documents_dict = {doc.metadata['doc_id']: doc for doc in documents}

# Parents that need a fresh summary: every non-SKIP operation whose doc_id is
# still present in `documents` (ids no longer in `documents` are left out)
non_skip_docs = [(documents_dict[doc_id], doc_id) for doc_id, operation in parent_docs_operations if operation != 'SKIP' and doc_id in documents_dict]
# Parents whose stored summary is re-read from the database
skip_doc_ids = [doc_id for doc_id, operation in parent_docs_operations if operation == 'SKIP']

# Generate summaries for the parent chunks that are not SKIP
parent_chunk = [doc.page_content for doc, _ in non_skip_docs]
text_summaries = summarize_chain.batch(parent_chunk, {"max_concurrency": 5})

# doc_id -> summary Document, filled from both sources below
temp_summary_docs = {}

# batch() returns one summary per input, in input order, so the parents and
# their summaries can be paired directly
for (doc, doc_id), summary_content in zip(non_skip_docs, text_summaries):
    temp_summary_docs[doc_id] = Document(page_content=summary_content, metadata={
        "doc_id": doc_id,
        "source": f"{doc.metadata.get('source')}(summary)",
        "page": doc.metadata.get("page"),
        "type": "summary"
    })

# Re-read the stored summaries of SKIP documents. The context manager closes
# the session even if a query raises.
with Session() as session:
    for doc_id in skip_doc_ids:
        # Only the text and metadata are needed, so the embedding is not fetched.
        # Ordering by id makes the outcome deterministic if several rows match
        # (the last row wins).
        query = (
            select(
                langchain_pg_embedding.c.document,
                langchain_pg_embedding.c.cmetadata,
            )
            .where(
                (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                (langchain_pg_embedding.c.cmetadata['type'].astext == 'summary')
            )
            .order_by(langchain_pg_embedding.c.id)
        )

        result = session.execute(query).fetchall()

        if not result:
            print(f"No result found for SKIP doc_id {doc_id}")
            continue

        for row in result:
            temp_summary_docs[doc_id] = Document(page_content=row.document, metadata=row.cmetadata)

# Combine the summaries into the final summary_docs list, following `documents` order
for doc in documents:
    doc_id = doc.metadata['doc_id']
    if doc_id in temp_summary_docs:
        summary_docs.append(temp_summary_docs[doc_id])
    else:
        # Handle the case where no summary was found or generated
        print(f"No summary found for document ID {doc_id}")




In [None]:
# Display the summary documents assembled in the previous cell
summary_docs


In [None]:
# Index the summary documents into the vector store through the record manager,
# with incremental cleanup keyed on the "source" metadata field
idx = index_with_ids(
    summary_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)


In [None]:
# Show what index_with_ids returned for the summary documents
idx

### Generating Hypothetical Questions for Each Parent Chunk

In [None]:
# Schema of the single "questions" argument: an array of strings
_questions_property = {"type": "array", "items": {"type": "string"}}

# Argument object of the function: one required "questions" field
_parameters_schema = {
    "type": "object",
    "properties": {"questions": _questions_property},
    "required": ["questions"],
}

# Function specification bound to the chat model in the question chain
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": _parameters_schema,
    }
]

In [None]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Chain that maps one parent Document to the value stored under "questions"
# in the model's function-call arguments.
# NOTE(review): the prompt asks for comma-separated questions while the bound
# function schema expects an array of strings — confirm the comma instruction
# is still wanted.
question_chain = (
    # Feed the document's text to the prompt as the `doc` variable
    {"doc": lambda x: x.page_content}
    # Only asking for 5 hypothetical questions, but this could be adjusted
    | ChatPromptTemplate.from_template(
        """Generate a list of exactly 5 hypothetical questions that the below document could be used to answer, in the original language of the text:\n\n{doc}
Separate each question with a comma (,)
        """
    )
    # Bind the function spec and request a call to `hypothetical_questions`
    | ChatOpenAI(max_retries=0, model="gpt-4o").bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    # Keep only the "questions" key of the function-call output
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [None]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from langchain.schema.document import Document

# Final list of question documents (up to 5 per parent, in `documents` order)
question_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Lightweight description of the embedding table, used only to build SELECTs
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Map doc_id -> parent document
documents_dict = {doc.metadata['doc_id']: doc for doc in documents}

# Parents that need freshly generated questions: every non-SKIP operation whose
# doc_id is still present in `documents`
non_skip_docs = [(documents_dict[doc_id], doc_id) for doc_id, operation in parent_docs_operations if operation != 'SKIP' and doc_id in documents_dict]
# Parents whose stored questions are re-read from the database
skip_doc_ids = [doc_id for doc_id, operation in parent_docs_operations if operation == 'SKIP']

# Generate hypothetical questions for the parent documents that are not SKIP
parent_documents = [doc for doc, _ in non_skip_docs]
hypothetical_questions = question_chain.batch(parent_documents, {"max_concurrency": 5})

# doc_id -> list of question Documents, filled from both sources below
temp_question_docs = {}

# batch() returns one question list per input, in input order, so the parents
# and their question lists can be paired directly
for (doc, doc_id), question_list in zip(non_skip_docs, hypothetical_questions):
    source = doc.metadata.get("source")
    page = doc.metadata.get("page")

    # Ensure there are exactly 5 questions for each document:
    # pad with empty strings if fewer than 5, truncate if more
    if len(question_list) < 5:
        question_list = question_list + [""] * (5 - len(question_list))

    for question_content in question_list[:5]:
        question_doc = Document(page_content=question_content, metadata={
            "doc_id": doc_id,
            "source": f"{source}(question)",
            "page": page,
            "type": "question"
        })
        temp_question_docs.setdefault(doc_id, []).append(question_doc)

# Re-read the stored questions of SKIP documents. The context manager closes
# the session even if a query raises.
with Session() as session:
    for doc_id in skip_doc_ids:
        # Only the text and metadata are needed, so the embedding is not fetched.
        # Ordering by id keeps the question order (and the [:5] cut) deterministic.
        query = (
            select(
                langchain_pg_embedding.c.document,
                langchain_pg_embedding.c.cmetadata,
            )
            .where(
                (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                (langchain_pg_embedding.c.cmetadata['type'].astext == 'question')
            )
            .order_by(langchain_pg_embedding.c.id)
        )

        result = session.execute(query).fetchall()

        # SKIP documents without stored questions contribute nothing
        if not result:
            continue

        questions = [
            Document(page_content=row.document, metadata=row.cmetadata)
            for row in result
        ]

        # Ensure there are exactly 5 questions for each document:
        # pad with empty documents if fewer than 5, truncate if more
        if len(questions) < 5:
            questions = questions + [Document(page_content="", metadata={
                "doc_id": doc_id,
                "source": f"{documents_dict[doc_id].metadata.get('source')}(question)",
                "page": documents_dict[doc_id].metadata.get("page"),
                "type": "question"
            }) for _ in range(5 - len(questions))]

        temp_question_docs[doc_id] = questions[:5]

# Combine the questions into the final question_docs list, following `documents` order
for doc in documents:
    doc_id = doc.metadata['doc_id']
    if doc_id in temp_question_docs:
        question_docs.extend(temp_question_docs[doc_id])

# The resulting question documents
question_docs


In [None]:
# Index the question documents into the vector store through the record manager,
# with incremental cleanup keyed on the "source" metadata field
idx = index_with_ids(
    question_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)



In [None]:
# Show what index_with_ids returned for the question documents
idx

### Creating an LCEL Chain and Testing the Retriever

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt that restricts the answer to the retrieved context
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model that writes the answer (temperature=0)
model = ChatOpenAI(model="gpt-4o", temperature=0)

# RAG pipeline: the incoming question is given to the retriever to fill
# `context` and is also passed through unchanged as `question`
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [None]:
# Ask the RAG chain a question in French ("Where should one give one's opinion?")
chain.invoke("Où doit-on donner son avis ?")


In [None]:
# Ask the RAG chain a question in French ("What is the email address mentioned in the document?")
chain.invoke("Quelle est l'adresse courriel mentionnée dans le document ?")