<a href="https://colab.research.google.com/github/dodeeric/langchain-ai-assistant-with-hybrid-rag/blob/main/BMAE_AI_Assistant_with_hybrid_RAG_v16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI Assistant (LLM Chatbot) with Hybrid RAG -- With chat history
v1: Hybrid RAG: keyword search (bm25) and semantic search (vector db)


v2: With memory: 1) Reformulate the question for RAG query (contextualize_q_prompt);  2) Add previous Q and A in prompt sent to the LLM.

v3: With PDF indexation

v5: With limited number of messages in chat history

In [None]:
!pip install --upgrade --quiet jq bs4 langchain langchain-community langchain-openai langchain-chroma langchainhub rank_bm25 pypdf

import requests, json, jq, time, bs4
from bs4 import BeautifulSoup
from google.colab import userdata
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader, JSONLoader, PyPDFLoader
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

OPENAI_API_KEY = userdata.get("OPENAI_API_KEY") # To access OpenAI LLM and embedding model via API
LANGCHAIN_API_KEY = userdata.get("LANGCHAIN_API_KEY") # To trace Langchain on Langsmith

%env OPENAI_API_KEY = $OPENAI_API_KEY
%env LANGCHAIN_API_KEY = $LANGCHAIN_API_KEY
%env LANGCHAIN_TRACING_V2 = "true"

# import dotenv
# dotenv.load_dotenv()

## Scrape

In [19]:
# Function to scrape the text and the metadata of a web page

def scrape_web_page(url):
    """
    Name: swp
    Scrape the text and the metadata of a web page
    Input: URL of the page
    Output: list of dictionaries with: url: url, metadata: metadata, text: text
    """

    #filter = "two-third last" # balat / irpa
    #filter = "media-body" # belgica / kbr
    filter = "hproduct commons-file-information-table" # commons / wikimedia: summary or description section

    # Get the page content
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=(filter)
            )
        ),
    )
    text = loader.load()
    # Covert Document type into string type
    text = text[0].page_content

    # Get the metadata (open graph from Facebook, og:xxx)
    # Get the HTML code
    response = requests.get(url)
    # Transform the HTML code from a Response object type into a BeautifulSoup object type to be scraped by Beautiful Soup
    soup = BeautifulSoup(response.text, "html.parser")
    # Get the metadata fields
    metadata = {} # Empty dictionary
    # Find all the meta tags in the HTML
    meta_tags = soup.find_all("meta")
    # Loop through the meta tags
    for tag in meta_tags:
        property = tag.get("property")
        content = tag.get("content")
        # Add the property-content pair to the dictionary
        if property and content:
            metadata[property] = content

    # Build JSON string with: url: url, metadata: metadata, text: summary text
    # Create a dictionary
    page = {
        "url": url, # String
        "metadata": metadata, # Dictionary
        "text": text # String
    }

    return page # Dictionary

In [None]:
# METHOD 1: Scrape the URLs from a file and save the results in a JSON file

#ds1:

file_path = "/content/drive/MyDrive/colab/balat-urls-ds1"
#file_path = "/content/drive/MyDrive/colab/belgica-urls-ds1"
#file_path = "/content/drive/MyDrive/colab/commons-urls-ds1"

#ds2:
#file_path = "/content/drive/MyDrive/colab/balat-urls-ds2"
#file_path = "/content/drive/MyDrive/colab/commons-urls-ds2"

with open(f"{file_path}.txt", "r") as urls_file:
    items = []
    for line in urls_file:
        url = line.strip() # Remove spaces at the beginning and at the end of the string
        url = url.replace("\ufeff", "")  # Remove BOM (Byte order mark at the start of a text stream)
        item = scrape_web_page(url)
        print(item)
        items.append(item)
        #time.sleep(1)

# Save the Python list in a JSON file
# json.dump is designed to take the Python objects, not the already-JSONified string. Read docs.python.org/3/library/json.html.
with open(f"{file_path}-swp.json", "w") as json_file:
    json.dump(items, json_file) # That step replaces the accentuated characters (ex: é) by its utf8 codes (ex: \u00e9)
json_file.close()

In [None]:
# METHOD 2: For Belgica: Scrape the URLs automatically generated and save the results in a JSON file

file_path = "/content/drive/MyDrive/colab/belgica-"

items = []

number1 = 10000101
step = 100
number2 = number1 + step

for number in range(number1, number2):
    url = f"https://opac.kbr.be/LIBRARY/doc/SYRACUSE/{number}"
    #print(url)
    item = scrape_web_page(url)
    print(item)
    if item["text"]:
        items.append(item)
        print("saved")
    #time.sleep(1)

# Save the Python list in a JSON file
# json.dump is designed to take the Python objects, not the already-JSONified string. Read docs.python.org/3/library/json.html.
with open(f"{file_path}-{number1}-{number2}-swp.json", "w") as json_file:
    json.dump(items, json_file) # That step replaces the accentuated characters (ex: é) by its utf8 codes (ex: \u00e9)
json_file.close()

In [20]:
# METHOD 3: For Commons: Scrape the URLs from a Commons Category and save the results in a JSON file

file_path = "/content/drive/MyDrive/colab/commons-"

category = "Category:Twenty-fifth_wedding_anniversary_of_King_Leopold_II_of_Belgium_and_Queen_Marie-Henriette_in_1878"

items = []
href_old = ""

# Step 1: Load the HTML content from a webpage
url = f"https://commons.wikimedia.org/wiki/{category}"
response = requests.get(url)
html_content = response.text

# Step 2: Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Step 3: Find all URLs in <a> tags
urls = []
for link in soup.find_all('a'):
    href = link.get('href')
    if href:
        #print(href)
        if href.startswith("/wiki/File:") and href != href_old: # This test because all links are in double!
            urls.append(f"https://commons.wikimedia.org{href}")
            href_old = href

#print("***********************************************************")
# Print all found URLs
#for url in urls:
#    print(url)
#print("***********************************************************")

number_of_pages = len(urls)
print(f"Number of pages to scrape: {number_of_pages}")

i = 1
items = []
for url in urls:
    print(f"{i}/{number_of_pages}")
    url = url.replace("\ufeff", "")  # Remove BOM (Byte order mark at the start of a text stream)
    item = scrape_web_page(url)
    print(item)
    items.append(item)
    #time.sleep(1)
    i = i + 1

# Save the Python list in a JSON file
# json.dump is designed to take the Python objects, not the already-JSONified string. Read docs.python.org/3/library/json.html.
with open(f"{file_path}{category}-swp.json", "w") as json_file:
    json.dump(items, json_file) # That step replaces the accentuated characters (ex: é) by its utf8 codes (ex: \u00e9)
json_file.close()

Number of pages to scrape: 8
1/8
{'url': 'https://commons.wikimedia.org/wiki/File:D%C3%A9putation_des_dames_belges_offrant_un_diad%C3%A8me_%C3%A0_la_reine.jpg', 'metadata': {'og:image': 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/06/D%C3%A9putation_des_dames_belges_offrant_un_diad%C3%A8me_%C3%A0_la_reine.jpg/640px-D%C3%A9putation_des_dames_belges_offrant_un_diad%C3%A8me_%C3%A0_la_reine.jpg', 'og:image:width': '640', 'og:image:height': '463', 'og:title': 'File:Députation des dames belges offrant un diadème à la reine.jpg - Wikimedia Commons', 'og:type': 'website'}, 'text': '\n\n\n\nEmile Delperée: Presentation of the gift to the Queen during the twenty-fifth wedding anniversary in 1878\n\xa0\xa0\n\n\nArtist\n\n\n\n\n\n\nEmile Delperée\n\xa0(1850–1896)\xa0\xa0 \xa0\n\n\n\n\n\nAlternative names\n\nBirth name: Émile Daxhelet; Emile Delperee; Emile Delpérée\n\nDescription\nBelgian painter\n\nDate of birth/death\n\n15 September 1850\xa0\n9 November 1896\xa0\n\nLocation of birth/d

In [None]:
# Open the JSON file to check its content (will produce an error if it's not a correctly formated JSON file)
with open(f"{file_path}-swp.json", "r") as input_file:
    items_read = json.load(input_file)

## Index

In [None]:
# Open the JSON files and load each JSON item one by one in the "documents" variable (type: Document)

file_path1 = "/content/drive/MyDrive/colab/commons-urls-ds1-swp.json"
file_path2 = "/content/drive/MyDrive/colab/balat-urls-ds1-swp.json"
file_path3 = "/content/drive/MyDrive/colab/belgica-urls-ds1-swp.json"
file_path4 = "/content/drive/MyDrive/colab/commons-urls-ds2-swp.json"
file_path5 = "/content/drive/MyDrive/colab/balat-urls-ds2-swp.json"
file_paths = [file_path1, file_path2, file_path3, file_path4, file_path5]

documents = []
for file_path in file_paths:
    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
    docs = loader.load() # Chunks (JSON items) from the JSON files; list of Documents
    documents = documents + docs # This variable contents all the JSON items

In [None]:
# Open the PDF files and load each page one by one in the "documents" variable (type: Document)

file_path1 = "/content/drive/MyDrive/colab/cdf-fxw.pdf"
file_paths = [file_path1]

for file_path in file_paths:
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split() # 1 pdf page per chunk
    documents = documents + pages

Run step 1 or step 2:

In [10]:
# STEP 1: Instanciate a Chroma DB and load the data from disk.
collection_name = "bmae"
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large") # 3072 dimensions vectors used to embed the chunks and the questions
vector_db = Chroma(embedding_function=embedding_model, collection_name=collection_name, persist_directory="/content/drive/MyDrive/colab/chromadb")
docs = vector_db.get()
documents= docs["documents"]

In [11]:
print(documents)

['{"url": "https://opac.kbr.be/LIBRARY/doc/SYRACUSE/20772659", "metadata": {"og:locale": "nl_BE", "og:type": "article", "og:title": "Derniers moments de S. M. L\\u00e9opold Ier, 10 d\\u00e9cembre 1865/Laatste ogenblikken van Z. M. Leopold I, 10 december 1865", "og:url": "https://opac.kbr.be/Library/doc/SYRACUSE/20772659/derniers-moments-de-s-m-leopold-ier-10-decembre-1865-laatste-ogenblikken-van-z-m-leopold-i-10-decemb", "og:image": "https://opac.kbr.be/Ils/digitalCollection/DigitalCollectionThumbnailHandler.ashx?documentId=21582580&size=LARGE&fallback=https%3a%2f%2fopac.kbr.be%2fui%2fskins%2fBRDBNEW%2fportal%2ffront%2fimages%2fGeneral%2fDocType%2fIMAG_LARGE.png"}, "text": "\\n\\n\\n\\n\\n\\n\\n\\nTitel\\nDerniers moments de S. M. L\\u00e9opold Ier, 10 d\\u00e9cembre 1865/Laatste ogenblikken van Z. M. Leopold I, 10 december 1865\\r\\n [Photograph]\\t\\r\\n   \\t\\t             \\n\\n\\nAuteur(s)Gh\\u00e9mar Fr\\u00e8res (Activity: 1859 * - 1894 (Bruxelles)) - The most prominent studio 

In [None]:
# STEP 2: ONLY TO EMBED! Instantiate a Chroma DB, embed the JSON items (documents), then save to disk.
collection_name = "bmae"
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large") # 3072 dimensions vectors used to embed the chunks and the questions
vector_db = Chroma.from_documents(documents, embedding_model, collection_name=collection_name, persist_directory="/content/drive/MyDrive/colab/chromadb")
# To check the Chroma vector db (sqlite3):
# $ sqlite3 chroma.sqlite3
# sqlite> .tables ===> List of the tables
# sqlite> select * from collections; ===> Name of the collection & size of the vectors
# sqlite> select * from embeddings; ===> Number of records in the db
# sqlite> select * from embedding_metadata; ===> Display json items

## Retrieve and generate

In [14]:
# LLM chatbot with a hybrid RAG chain:
# (To embed the question, the same model is used as for the data; the model is given in "vector_db".)

llm = ChatOpenAI(model="gpt-4-turbo-2024-04-09", temperature=0)

# Semantic search (vector retriever)
vector_retriever = vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 3}) # Chroma DB

# Keyword search (bm25 retriever)
keyword_retriever = BM25Retriever.from_texts(documents)
keyword_retriever.k = 3

# Ensemble retriever (mix of both retrivers) -- Weights = order of the results!!! [1,0] means: all bm25 first, all vector after...
ensemble_retriever = EnsembleRetriever(retrievers=[keyword_retriever, vector_retriever], weights=[0.5, 0.5])

"""
# Without memory:

# Download prompt template: system prompt + inputs (rag_output + chat_history + question)
prompt = hub.pull("dodeeric/rag-prompt-bmae-with-history")

# Take the text content of each doc, and concatenate them in one string to pass to the prompt (context)
def format_docs_clear_text(docs):
    return "\n\n".join(doc.page_content.encode('utf-8').decode('unicode_escape') for doc in docs)

# Function to display the text content of the prompt in ai_assistant_chain
def print_and_pass(data):
    print(f"Prompt content sent to the LLM: {data}")
    return data

# Langchain chain: the LLM chatbot with hybrid RAG. Type: RunnableSequence (chain) -- How/where is the question pass to the RAG??? In LangSmith, we can see the input (question) of the 3 retreivers
ai_assistant_chain = ({"rag_output": ensemble_retriever | format_docs_clear_text, "chat_history": RunnablePassthrough(), "question": RunnablePassthrough()}
    | prompt
    #| print_and_pass
    | llm
    | StrOutputParser() # Convert to string
)
"""

from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import HumanMessage, AIMessage

from langchain.memory import ConversationBufferWindowMemory

chat_history = []
chat_history2 = ConversationBufferWindowMemory(k=2, return_messages=True)

contextualize_q_system_prompt = """
Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is.
"""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

history_aware_retriever = create_history_aware_retriever(
    llm, ensemble_retriever, contextualize_q_prompt
)

qa_system_prompt = """
You are an artwork specialist. You must assist the users in finding, describing, and displaying artworks related to the Belgian monarchy. \
You first have to search answers in the "Knowledge Base". If no answers are found in the "Knowledge Base", then answer with your own knowledge. \
You have to answer in the same language as the question.
At the end of the answer:
- give a link to a web page about the artwork (see the "url" field).
- display an image of the artwork (see the "og:image" field).

Knowledge Base:

{context}
"""

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

ai_assistant_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

Query the AI Assistant:

In [15]:
question = "Pouvez-vous me montrer le tableau 'La revue des écoles' ?"

In [None]:
question = "Qui a peint ce tableau ?"

In [None]:
question = "Quelle est la dimension du tableau ?"

In [None]:
question = "Pouvez-vous me montrer un tableau de Charles Porion ?"

In [None]:
question = "Quel est la date de naissance du peintre ?"

In [None]:
question = "Camille Van Camp a-t-il fait des croquis pour sa peinture 'La fête patriotique ' ?"

In [None]:
#answer = ai_assistant_chain.invoke(question) # Without memory

In [17]:
output = ai_assistant_chain.invoke({"input": question, "chat_history": chat_history}) # output is a dictionary. output["answer"] is in markdown format.

In [18]:
print(output["answer"])

Le tableau "La revue des écoles en 1878" est une œuvre de Jan Verhas, un peintre belge. Cette peinture illustre la revue des écoles communales de la ville de Bruxelles qui a eu lieu le 23 août 1878, Place des Palais, devant le Palais royal de Bruxelles, à l'occasion des noces d'argent du roi Léopold II et de la reine Marie-Henriette. Le tableau montre diverses personnalités importantes de l'époque, y compris le roi Léopold II et l'archiduc Charles-Louis d'Autriche.

Pour plus de détails, vous pouvez visiter la page suivante : [La revue des écoles en 1878](https://commons.wikimedia.org/wiki/File:Jan_Verhas_(1834-1896)_Optocht_van_de_scholen_in_1878_-_Old_Masters_Museum_Brussel_30-4-2017_11-18-20.JPG)

Voici une image du tableau :
![La revue des écoles en 1878](https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/Jan_Verhas_%281834-1896%29_Optocht_van_de_scholen_in_1878_-_Old_Masters_Museum_Brussel_30-4-2017_11-18-20.JPG/640px-Jan_Verhas_%281834-1896%29_Optocht_van_de_scholen_in_187

In [None]:
#chat_history.extend([HumanMessage(content=question), output["answer"]])

In [None]:
#chat_history.extend([question, output["answer"]])

In [None]:
chat_history2.save_context({"input": question}, {"output": output["answer"]})

In [None]:
load_memory = chat_history2.load_memory_variables({})

In [None]:
chat_history = load_memory["history"]

In [None]:
print(chat_history)

[HumanMessage(content="Pouvez-vous me montrer le tableau 'La revue des écoles' ?"), AIMessage(content='Le tableau "La revue des écoles en 1878" de Jan Verhas est une œuvre importante qui capture un événement marquant de l\'histoire belge. Réalisée en 1880, cette peinture illustre le défilé de 23.000 élèves des écoles bruxelloises devant le roi Léopold II et la reine Marie-Henriette à l\'occasion de leurs noces d\'argent. L\'événement a eu lieu le 22 août 1878 sur la place des Palais à Bruxelles. Cette œuvre est célèbre pour sa représentation détaillée et vivante de la parade, mettant en avant l\'importance de l\'éducation et la jeunesse dans la société belge de l\'époque.\n\nPour plus d\'informations sur ce tableau, vous pouvez visiter la page suivante : [La revue des écoles en 1878](https://commons.wikimedia.org/wiki/File:Jan_Verhas_(1834-1896)_Optocht_van_de_scholen_in_1878_-_Old_Masters_Museum_Brussel_30-4-2017_11-18-20.JPG)\n\n![La revue des écoles en 1878](https://upload.wikimedia

In [None]:
# Query the vector RAG only
docs = vector_db.similarity_search(question, k=2) # List of Documents; page_content of a Document: string
print(docs)

Tests