In [1]:
from dotenv import load_dotenv
import os

# Load environment variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key (and possibly the
# database settings) needed by the cells below — confirm against the .env file.
load_dotenv()

True

### Understanding Store in LangChain

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Build the path from its components so the notebook also runs on Linux/macOS;
# a hard-coded backslash is only a path separator on Windows (where this still
# yields the same "data\toronto.pdf" string as before).
file_path = os.path.join("data", "toronto.pdf")

loader = PyPDFLoader(file_path=file_path)

# PyPDFLoader yields one Document per PDF page. With text_splitter=None,
# load_and_split() falls back to LangChain's default RecursiveCharacterTextSplitter,
# so a page stays whole only while it fits inside that splitter's default chunk size.
documents = loader.load_and_split(text_splitter=None)
documents

[Document(page_content='Things to Do in Toronto \nPage 1: Introduction \nToronto, the capital of Ontario, is the largest city in Canada and a dynamic, cosmopolitan \nhub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor. Very Nice \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t \nmiss the glass ﬂoor and the revolving restaurant at the top. \n\uf0b7 Royal Ontario Museum (ROM):  Canada’s largest museum of world cultures and \nnatural history is a must-visit. \n\uf0b7 Toronto Islands:  A group of small islands located just o Ư the city’s shore, o Ưering \nbeautiful beaches, picnic spots, and bike rentals.', metadata={'source': 'data\\toronto.pdf', 'page': 0}),
 Document(page_content='Page 2: Cultural Experiences \nToronto is a melting pot of cultures, and this is reﬂected in its neighborhoods and festivals. \nNeighborhoods:  \n\u

### Using the PostgresByteStore

In [3]:
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_postgres import PGVector
from database import COLLECTION_NAME, CONNECTION_STRING
from utils.store import PostgresByteStore
from langchain_postgres import PostgresSaver, PickleCheckpointSerializer
from utils.custom_sql_record_manager import CustomSQLRecordManager
from utils.index_with_ids import index_with_ids


# Vector store: receives the embedded child documents (smaller chunks, summaries
# and hypothetical questions) that the cells below index into it.
embeddings = OpenAIEmbeddings()
vectorstore = PGVector(
    embeddings=embeddings,
    collection_name=COLLECTION_NAME,
    connection=CONNECTION_STRING,
    use_jsonb=True,
)

# Document store: receives the full parent pages, written further down through
# retriever.docstore.conditional_mset().
store = PostgresByteStore(CONNECTION_STRING, COLLECTION_NAME)
# Metadata key that links every child document back to its parent page.
id_key = "doc_id"

# The retriever is given both stores plus the metadata key that ties them together.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore, 
    docstore=store, 
    id_key=id_key,
)

# Record manager handed to index_with_ids() in the indexing cells below.
# create_schema() is called once here, before any indexing cell runs.
namespace = f"pgvector/{COLLECTION_NAME}"
record_manager = CustomSQLRecordManager(
    namespace, db_url=CONNECTION_STRING
)
record_manager.create_schema()

# Display the retriever as the cell output.
retriever

MultiVectorRetriever(vectorstore=<langchain_postgres.vectorstores.PGVector object at 0x0000020554E92490>, docstore=<utils.store.PostgresByteStore object at 0x0000020551998A10>)

In [4]:
from utils.utils import generate_reproducible_id_by_content

# Add a reproducible unique doc_id to each document's metadata.
# A doc_id left behind by an earlier run of this cell is excluded from the metadata
# handed to the ID generator, so re-running the cell cannot feed the previous ID
# back into the next one. On a fresh kernel the metadata has no doc_id yet, so the
# generated IDs are the same as before this change.
# NOTE(review): presumably the helper derives the ID from both arguments — confirm.
for doc in documents:
    id_source_metadata = {key: value for key, value in doc.metadata.items() if key != "doc_id"}
    doc.metadata["doc_id"] = generate_reproducible_id_by_content(doc.page_content, id_source_metadata)

In [5]:
documents

[Document(page_content='Things to Do in Toronto \nPage 1: Introduction \nToronto, the capital of Ontario, is the largest city in Canada and a dynamic, cosmopolitan \nhub. Known for its towering skyline, bustling waterfront, and numerous cultural attractions, \nToronto o Ưers a wealth of experiences for every visitor. Very Nice \nKey Attractions:  \n\uf0b7 CN Tower:  This iconic symbol of Toronto o Ưers panoramic views of the city. Don’t \nmiss the glass ﬂoor and the revolving restaurant at the top. \n\uf0b7 Royal Ontario Museum (ROM):  Canada’s largest museum of world cultures and \nnatural history is a must-visit. \n\uf0b7 Toronto Islands:  A group of small islands located just o Ư the city’s shore, o Ưering \nbeautiful beaches, picnic spots, and bike rentals.', metadata={'source': 'data\\toronto.pdf', 'page': 0, 'doc_id': 'ebbddde6-aaed-5a6b-bed6-39a114d407b1'}),
 Document(page_content='Page 2: Cultural Experiences \nToronto is a melting pot of cultures, and this is reﬂected in its n

In [6]:
# Build one (doc_id, document, source) entry per parent page
doc_id_document_tuples = []
for parent_doc in documents:
    parent_meta = parent_doc.metadata
    doc_id_document_tuples.append(
        (parent_meta["doc_id"], parent_doc, parent_meta["source"])
    )

# Hand the entries to the docstore; the returned (doc_id, operation) pairs drive
# the chunk, summary and question cells that follow
parent_docs_operations = retriever.docstore.conditional_mset(doc_id_document_tuples)

In [7]:
parent_docs_operations

[('eac9bbc7-a391-5931-a26c-11d9ee2402aa', 'DEL'),
 ('ebbddde6-aaed-5a6b-bed6-39a114d407b1', 'INS'),
 ('4d722603-1c85-56ab-82f2-2d4dfdd3eb68', 'SKIP'),
 ('80e1944b-044e-5dc8-899f-7a941f1fa08b', 'SKIP'),
 ('a9cace25-2615-59b1-9669-ccd6656ac767', 'SKIP')]

### Creating Smaller Documents

In [8]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators the child splitter tries, in order, when cutting a parent page
separators = ["\n\n", "\n", ".", "?", "!"]

# Initialize the RecursiveCharacterTextSplitter with fixed parameters
child_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    separators=separators
)

# List to store all sub-documents
all_sub_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Minimal description of the PGVector embedding table, just enough to query it
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Create a dictionary to map doc_id to documents
documents_dict = {doc.metadata['doc_id']: doc for doc in documents}

# Sort the parent documents operations to ensure deterministic processing order
parent_docs_operations = sorted(parent_docs_operations, key=lambda x: x[0])

# The context manager closes the session even if a query or the splitter raises.
with Session() as session:
    for doc_id, operation in parent_docs_operations:
        if operation == 'SKIP':
            # Unchanged parent: reuse the chunks that are already stored instead of
            # re-splitting. Only the text and metadata are needed, so the (large)
            # embedding column is not fetched.
            # NOTE(review): the lookup is not restricted to one collection_id —
            # confirm doc_ids cannot collide across collections.
            query = (
                select(
                    langchain_pg_embedding.c.document,
                    langchain_pg_embedding.c.cmetadata,
                )
                .where(
                    (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                    (langchain_pg_embedding.c.cmetadata['type'].astext == 'smaller chunk')
                )
                .order_by(langchain_pg_embedding.c.id)  # Ensure fixed order
            )

            # Recreate sub-documents from fetched records
            for row in session.execute(query).all():
                all_sub_docs.append(
                    Document(page_content=row.document, metadata=row.cmetadata)
                )
        elif doc_id in documents_dict:
            # New/changed parent: split it into smaller chunks.
            # Operations whose doc_id is not among the loaded documents (e.g. 'DEL')
            # produce no chunks here.
            doc = documents_dict[doc_id]
            source = doc.metadata.get("source")
            sub_docs = child_text_splitter.split_documents([doc])
            # Ensure fixed order for sub-documents
            sub_docs = sorted(sub_docs, key=lambda x: x.page_content)
            for sub_doc in sub_docs:
                sub_doc.metadata["doc_id"] = doc_id  # Link the chunk to its parent
                sub_doc.metadata["source"] = f"{source}(smaller chunk)"  # Add the suffix to the source
                sub_doc.metadata["type"] = "smaller chunk"
            all_sub_docs.extend(sub_docs)

# The resulting sub-documents
all_sub_docs

[Document(page_content='\uf0b7 Distillery District:  Known for its well-preserved Victorian Industrial architecture, it’s \nnow home to boutiques, art galleries, and performance spaces. \nFestivals:  \n\uf0b7 Caribana:  A festival celebrating Caribbean culture and traditions, held in summer. \n\uf0b7 Toronto International Film Festival (TIFF):  One of the most prestigious ﬁlm \nfestivals in the world, held annually in September.', metadata={'page': 1, 'type': 'smaller chunk', 'doc_id': '4d722603-1c85-56ab-82f2-2d4dfdd3eb68', 'source': 'data\\toronto.pdf(smaller chunk)'}),
 Document(page_content='Page 2: Cultural Experiences \nToronto is a melting pot of cultures, and this is reﬂected in its neighborhoods and festivals. \nNeighborhoods:  \n\uf0b7 Chinatown:  One of North America’s largest Chinatowns, known for its vibrant food \nscene. \n\uf0b7 Kensington Market:  A bohemian neighborhood o Ưering vintage shops, eclectic \nboutiques, and international food stalls.', metadata={'page': 1, 

In [9]:
len(all_sub_docs)

9

In [10]:
# Index the smaller chunks into the vector store. The chunks carry the
# "(smaller chunk)" suffix in their "source" metadata, which is the key used here.
# NOTE(review): presumably index_with_ids mirrors LangChain's index() semantics for
# cleanup="incremental" — confirm in utils.index_with_ids.
idx = index_with_ids(
    all_sub_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)


In [11]:
idx

{'status': 'success',
 'ids': [{'key': '14f23328-f92f-56da-b537-791a27cb7b8c', 'operation': 'SKIP'},
  {'key': '503d7916-e17c-560b-8f6d-fb43883ba7bf', 'operation': 'SKIP'},
  {'key': '096a5f55-5a31-587c-a137-1e9130286d32', 'operation': 'SKIP'},
  {'key': '5a393ada-9b12-5354-b38f-295b353ce558', 'operation': 'SKIP'},
  {'key': '55c17723-722d-53da-ad32-56d0c6e00111', 'operation': 'SKIP'},
  {'key': '93f3c607-7894-540f-b0b2-b65d5bef0007', 'operation': 'SKIP'},
  {'key': '9c175fca-bd5e-53ea-b5cd-37bd9e597ec7', 'operation': 'INS'},
  {'key': '4284f8a3-ca85-5380-bdc6-ee84fd3ea87d', 'operation': 'INS'},
  {'key': '33ca6a23-fa45-55ba-be60-6f17ba8b10bc', 'operation': 'INS'},
  {'key': 'de64f070-8219-5b39-8032-cc0f8b38129c', 'operation': 'DEL'},
  {'key': 'f5d04788-e830-5f9e-a59b-6f868320a1aa', 'operation': 'DEL'}],
 'results': [{'num_added': 3,
   'num_updated': 0,
   'num_skipped': 6,
   'num_deleted': 2}]}

### Creating Summaries for Each Parent Chunk

In [12]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Instruction given to the model; {element} is replaced by the parent page text
prompt_text = (
    "You are an assistant tasked with summarizing and rephrasing text to "
    "maintain the original intent of the document. "
    "Rephrase the following text chunk in its original language, ensuring to "
    "preserve the original meaning and context, even if it comes from a chart "
    "or table: \n\n{element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Language model used for the summaries (temperature 0)
model = ChatOpenAI(model="gpt-4o", temperature=0)

# Summary chain: raw text in -> prompt -> model -> plain string out
summarize_chain = (
    {"element": lambda x: x}
    | prompt
    | model
    | StrOutputParser()
)

In [13]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from langchain.schema.document import Document

# List to store all summary documents
summary_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Minimal description of the PGVector embedding table, just enough to query it
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Create a dictionary to map doc_id to documents
documents_dict = {doc.metadata['doc_id']: doc for doc in documents}

# Parents that need a fresh summary (not SKIP and present in documents_dict) vs.
# parents whose stored summary is reused (SKIP)
non_skip_docs = [(documents_dict[doc_id], doc_id) for doc_id, operation in parent_docs_operations if operation != 'SKIP' and doc_id in documents_dict]
skip_doc_ids = [doc_id for doc_id, operation in parent_docs_operations if operation == 'SKIP']

# Generate summaries for the parent chunks that are not SKIP
parent_chunk = [doc.page_content for doc, _ in non_skip_docs]
text_summaries = summarize_chain.batch(parent_chunk, {"max_concurrency": 5})

# Summaries keyed by parent doc_id, filled from both sources below
temp_summary_docs = {}

# batch() returns one result per input, in input order, so the summaries can be
# paired with their parents positionally
for (doc, doc_id), summary_content in zip(non_skip_docs, text_summaries):
    source = doc.metadata.get("source")
    page = doc.metadata.get("page")
    temp_summary_docs[doc_id] = Document(page_content=summary_content, metadata={
        "doc_id": doc_id,
        "source": f"{source}(summary)",
        "page": page,
        "type": "summary"
    })

# Reload the stored summaries of SKIP parents. The context manager closes the
# session even if a query raises.
with Session() as session:
    for doc_id in skip_doc_ids:
        # Only the text and metadata are needed, so the embedding column is not
        # fetched. Ordering by id makes the choice deterministic if several
        # summary rows exist for one parent.
        query = (
            select(
                langchain_pg_embedding.c.document,
                langchain_pg_embedding.c.cmetadata,
            )
            .where(
                (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                (langchain_pg_embedding.c.cmetadata['type'].astext == 'summary')
            )
            .order_by(langchain_pg_embedding.c.id)
        )

        rows = session.execute(query).all()

        if not rows:
            print(f"No result found for SKIP doc_id {doc_id}")
            continue

        # One summary per parent: when several rows match, the last one is kept
        stored = rows[-1]
        temp_summary_docs[doc_id] = Document(page_content=stored.document, metadata=stored.cmetadata)

# Combine the summaries into the final summary_docs list, in document order
for doc in documents:
    doc_id = doc.metadata['doc_id']
    if doc_id in temp_summary_docs:
        summary_docs.append(temp_summary_docs[doc_id])
    else:
        # Handle the case where no summary was found or generated
        print(f"No summary found for document ID {doc_id}")

In [14]:
summary_docs


[Document(page_content='Things to Do in Toronto\nPage 1: Introduction\nToronto, the capital of Ontario, is the largest city in Canada and a vibrant, cosmopolitan center. Renowned for its impressive skyline, lively waterfront, and numerous cultural attractions, Toronto offers a variety of experiences for every visitor. Very Nice\nKey Attractions:\n- CN Tower: This iconic symbol of Toronto provides panoramic views of the city. Don’t miss the glass floor and the revolving restaurant at the top.\n- Royal Ontario Museum (ROM): Canada’s largest museum of world cultures and natural history is a must-visit.\n- Toronto Islands: A group of small islands located just off the city’s shore, offering beautiful beaches, picnic spots, and bike rentals.', metadata={'doc_id': 'ebbddde6-aaed-5a6b-bed6-39a114d407b1', 'source': 'data\\toronto.pdf(summary)', 'page': 0, 'type': 'summary'}),
 Document(page_content='Page 2: Cultural Experiences\nToronto is a diverse cultural hub, evident in its neighborhoods a

In [15]:
# Index the summaries into the vector store. The summaries carry the "(summary)"
# suffix in their "source" metadata, which is the key used here.
# NOTE(review): presumably index_with_ids mirrors LangChain's index() semantics for
# cleanup="incremental" — confirm in utils.index_with_ids.
idx = index_with_ids(
    summary_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)


In [16]:
idx

{'status': 'success',
 'ids': [{'key': '7dae084f-d577-5228-ab58-abcdd6b0f6d4', 'operation': 'SKIP'},
  {'key': 'e90c4c41-f1b3-5514-9bb4-f25906fe232d', 'operation': 'SKIP'},
  {'key': '42ecd646-2e6b-581c-8a31-4e0cabda14c9', 'operation': 'SKIP'},
  {'key': 'df534562-28fe-5b56-b1c1-6a35f60d37be', 'operation': 'INS'},
  {'key': '80c736a3-8c42-502c-8cf9-9a596496c17f', 'operation': 'DEL'}],
 'results': [{'num_added': 1,
   'num_updated': 0,
   'num_skipped': 3,
   'num_deleted': 1}]}

### Generating Hypothetical Questions for Each Parent Chunk

In [17]:
# JSON schema of the single argument: a list of question strings
_question_list_schema = {
    "type": "array",
    "items": {"type": "string"},
}

# OpenAI function definition the model is asked to call with its questions
_hypothetical_questions_function = {
    "name": "hypothetical_questions",
    "description": "Generate hypothetical questions",
    "parameters": {
        "type": "object",
        "properties": {"questions": _question_list_schema},
        "required": ["questions"],
    },
}

functions = [_hypothetical_questions_function]

In [18]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Prompt (in French) asking for hypothetical questions the document could answer.
# Only asking for 5 hypothetical questions, but this could be adjusted
# NOTE(review): the prompt also asks for comma-separated output although the answer
# is returned through the function-call schema — confirm before editing the text.
question_prompt_text = """Générez une liste de 5 questions hypothétiques que le document ci-dessous pourrait être utilisé pour répondre par a des questions sur le document, dans la langue originale du texte. Le document est extrait d'un PPTX ou PDF et peut nécessiter l'analyse du contexte et de la signification. Si le texte est trop court pour générer 5 questions, fournissez autant de questions que le texte le permet.
\nLe texte généré sera utilisé pour effectuer des recherches de similarité dans une base de données vectorielle.
\n\n{doc}\n\n

Séparez chaque question par une virgule (,).
"""

# Model bound so that it always answers by calling `hypothetical_questions`
question_llm = ChatOpenAI(max_retries=0, model="gpt-4o").bind(
    functions=functions, function_call={"name": "hypothetical_questions"}
)

# Document in -> prompt -> function-calling model -> list under the "questions" key
question_chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template(question_prompt_text)
    | question_llm
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [19]:
from sqlalchemy import create_engine, Column, String, LargeBinary, select, Table, MetaData
from sqlalchemy.orm import sessionmaker
from sqlalchemy.dialects.postgresql import JSONB
from langchain.schema.document import Document

# List to store all question documents
question_docs = []

# Database connection setup
engine = create_engine(CONNECTION_STRING)
Session = sessionmaker(bind=engine)

# Minimal description of the PGVector embedding table, just enough to query it
metadata = MetaData()
langchain_pg_embedding = Table(
    'langchain_pg_embedding', metadata,
    Column('id', String, primary_key=True),
    Column('collection_id', String),
    Column('embedding', LargeBinary),
    Column('document', String),
    Column('cmetadata', JSONB)
)

# Create a dictionary to map doc_id to documents
documents_dict = {doc.metadata['doc_id']: doc for doc in documents}

# Parents that need fresh questions (not SKIP and present in documents_dict) vs.
# parents whose stored questions are reused (SKIP)
non_skip_docs = [(documents_dict[doc_id], doc_id) for doc_id, operation in parent_docs_operations if operation != 'SKIP' and doc_id in documents_dict]
skip_doc_ids = [doc_id for doc_id, operation in parent_docs_operations if operation == 'SKIP']

# Generate hypothetical questions for the parent documents that are not SKIP
parent_documents = [doc for doc, _ in non_skip_docs]

# Report how many parents will be sent to the model
print(f"Number of non-SKIP documents: {len(parent_documents)}")

# Skip the model call entirely when there is nothing to process
if parent_documents:
    hypothetical_questions = question_chain.batch(parent_documents, {"max_concurrency": 5})
else:
    hypothetical_questions = []

# Question documents keyed by parent doc_id, filled from both sources below
temp_question_docs = {}

# batch() returns one result per input, in input order, so each question list can
# be paired with its parent positionally
for (doc, doc_id), question_list in zip(non_skip_docs, hypothetical_questions):
    source = doc.metadata.get("source")
    page = doc.metadata.get("page")

    for question_content in question_list:
        question_doc = Document(page_content=question_content, metadata={
            "doc_id": doc_id,
            "source": f"{source}(question)",
            "page": page,
            "type": "question"
        })
        temp_question_docs.setdefault(doc_id, []).append(question_doc)

# Reload the stored questions of SKIP parents. The context manager closes the
# session even if a query raises.
with Session() as session:
    for doc_id in skip_doc_ids:
        # Only the text and metadata are needed, so the embedding column is not
        # fetched. Ordering by id keeps the question order stable between runs.
        query = (
            select(
                langchain_pg_embedding.c.document,
                langchain_pg_embedding.c.cmetadata,
            )
            .where(
                (langchain_pg_embedding.c.cmetadata['doc_id'].astext == doc_id) &
                (langchain_pg_embedding.c.cmetadata['type'].astext == 'question')
            )
            .order_by(langchain_pg_embedding.c.id)
        )

        rows = session.execute(query).all()

        if rows:
            temp_question_docs[doc_id] = [
                Document(page_content=row.document, metadata=row.cmetadata)
                for row in rows
            ]

# Combine the questions into the final question_docs list, in document order
for doc in documents:
    question_docs.extend(temp_question_docs.get(doc.metadata['doc_id'], []))

# The resulting question documents
question_docs


Number of non-SKIP documents: 1


[Document(page_content='Quels sont les principaux symboles emblématiques de Toronto?', metadata={'doc_id': 'ebbddde6-aaed-5a6b-bed6-39a114d407b1', 'source': 'data\\toronto.pdf(question)', 'page': 0, 'type': 'question'}),
 Document(page_content='Quelle est la plus grande ville du Canada?', metadata={'doc_id': 'ebbddde6-aaed-5a6b-bed6-39a114d407b1', 'source': 'data\\toronto.pdf(question)', 'page': 0, 'type': 'question'}),
 Document(page_content="Quels types d'attractions culturelles peut-on trouver à Toronto?", metadata={'doc_id': 'ebbddde6-aaed-5a6b-bed6-39a114d407b1', 'source': 'data\\toronto.pdf(question)', 'page': 0, 'type': 'question'}),
 Document(page_content='Quelles activités peut-on faire sur les îles de Toronto?', metadata={'doc_id': 'ebbddde6-aaed-5a6b-bed6-39a114d407b1', 'source': 'data\\toronto.pdf(question)', 'page': 0, 'type': 'question'}),
 Document(page_content="Quelle est l'importance du Royal Ontario Museum dans le contexte culturel canadien?", metadata={'doc_id': 'ebb

In [None]:
# Collect only the text of every question document for a quick visual check
page_contents = [question_doc.page_content for question_doc in question_docs]

print(page_contents)

In [20]:
# Index the hypothetical questions into the vector store. The questions carry the
# "(question)" suffix in their "source" metadata, which is the key used here.
# NOTE(review): presumably index_with_ids mirrors LangChain's index() semantics for
# cleanup="incremental" — confirm in utils.index_with_ids.
idx = index_with_ids(
    question_docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)



In [21]:
idx

{'status': 'success',
 'ids': [{'key': 'a40bc2d2-7ce4-5094-b726-d3486398895c', 'operation': 'SKIP'},
  {'key': 'bdd535a9-a955-5c55-ab0d-07a30a0fb065', 'operation': 'SKIP'},
  {'key': 'e4816356-4cb9-54a1-b878-bf4550a0b880', 'operation': 'SKIP'},
  {'key': 'e27719f7-7715-5b34-9275-a466f52ebfd3', 'operation': 'SKIP'},
  {'key': '720c46ff-fd12-55f3-aeec-42b996cd7fc2', 'operation': 'SKIP'},
  {'key': 'dacd36fb-4e46-5eef-9227-78c323613893', 'operation': 'SKIP'},
  {'key': '91976869-41e3-50eb-9a03-d297913891b4', 'operation': 'SKIP'},
  {'key': '86f2cc0c-e5f5-555f-8721-897577bd7e00', 'operation': 'SKIP'},
  {'key': 'cd05a4e2-15b9-5301-bc8e-5d62db321ba4', 'operation': 'SKIP'},
  {'key': '3f762943-55aa-56ef-a920-b92136ad1276', 'operation': 'SKIP'},
  {'key': 'e0fa3be8-40e4-5e58-8958-6d3714e87241', 'operation': 'SKIP'},
  {'key': '57adb96b-bd2c-5d9e-a2bf-308575722445', 'operation': 'SKIP'},
  {'key': 'c585f4ac-5c11-5e12-b6ef-325301c36d31', 'operation': 'SKIP'},
  {'key': '06379860-5377-5e7c-b5a1-

### Creating an LCEL Chain and Testing the Retriever

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt: the answer must rely only on the retrieved context
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model used to answer (temperature 0)
model = ChatOpenAI(model="gpt-4o", temperature=0)

# The incoming question feeds the retriever (-> context) and is also passed
# through unchanged (-> question)
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}

# RAG pipeline: inputs -> prompt -> model -> plain string
chain = rag_inputs | prompt | model | StrOutputParser()

In [None]:
chain.invoke("Où doit-on donner son avis ?")


In [None]:
chain.invoke("Quelle est l'adresse courriel mentionnée dans le document ?")