# Question transformations

# Splitting and ingesting the content of various URLs (across Japanese destinations)

### Preparing the Chroma DB collections

In [None]:
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Placeholder value. Nothing in the visible notebook reads it (embeddings come
# from HuggingFace and the chat model from Ollama).
# NOTE(review): confirm whether this is still needed.
OPENAI_API_KEY = "TEST"

In [None]:
# Vector store collection named "granular"; it later receives the H2-level
# sections of each destination page. Texts are embedded with the HuggingFace
# "all-MiniLM-L6-v2" model.
granular_collection = Chroma(
    collection_name="granular",
    embedding_function=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
)

# A: reset the collection in case it already exists, so the ingestion that
# follows presumably starts from an empty store — confirm against the
# langchain_chroma documentation.
granular_collection.reset_collection()

### Splitting and ingesting HTML content with the HTMLSectionSplitter 

In [None]:
from langchain_text_splitters import HTMLSectionSplitter
from langchain_community.document_loaders import AsyncHtmlLoader

In [None]:
# Destinations whose Wikivoyage pages are ingested. Each name is appended to
# the root URL to form the page address.
destinations = [
    "Tokyo",
    "Hiroshima",
    "Kanazawa",
    "Kyoto",
    "Nagasaki",
    "Nara",
    "Osaka",
    "Sapporo",
    "Sendai",
]

# Base URL under which each destination page lives.
wikivoyage_root_url = "https://en.wikivoyage.org/wiki"

In [None]:
# One page URL per destination, built as "<root>/<destination>".
destination_urls = [
    "/".join((wikivoyage_root_url, name)) for name in destinations
]

In [None]:
# Split pages on <h1> and <h2> tags. The second element of each tuple is the
# metadata key recorded on the resulting chunks; the "Header 2" key is what
# split_docs_into_granular_chunks filters on.
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
html_section_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)

In [None]:
def split_docs_into_granular_chunks(docs):
    """Split HTML documents into sections and keep the H2-level ones.

    Args:
        docs: Iterable of documents whose ``page_content`` holds raw HTML.

    Returns:
        A flat list of the section chunks, across all documents, whose
        metadata contains a "Header 2" key, in document order.
    """
    granular_chunks = []
    for document in docs:
        raw_html = document.page_content  # B
        sections = html_section_splitter.split_text(raw_html)  # C
        for section in sections:
            if "Header 2" in section.metadata:  # D
                granular_chunks.append(section)

    return granular_chunks

In [None]:
# Ingest every destination page: download its HTML, split it into H2
# sections, and store those sections in the "granular" collection.
for destination_url in destination_urls:
    html_loader = AsyncHtmlLoader(destination_url)  # E
    docs = html_loader.load()  # F

    # Show where each loaded document came from.
    for doc in docs:
        print(doc.metadata)

    # Split and store once per destination. Doing this inside the
    # per-document loop re-splits the whole batch and inserts the same chunks
    # again for every document the loader returns.
    granular_chunks = split_docs_into_granular_chunks(docs)

    # Skip the insert when nothing was loaded or no H2 section was found; the
    # original also inserted nothing when `docs` was empty, and the store may
    # reject an empty batch.
    if granular_chunks:
        granular_collection.add_documents(documents=granular_chunks)

# A Reset the collection in case it already exists
# B Extract the HTML text from the document
# C Each chunk is a H1 or H2 HTML section
# D Only keep content associated with H2 sections
# E Loader for one destination
# F Documents of one destination

# Rewrite-retrieve-read

## Retrieving content with original user question

In [None]:
# Baseline: query the vector store with the user's question exactly as asked
# and print the four chunks returned by the similarity search.
user_question = "Tell me some fun things I can enjoy in Kyoto"
initial_results = granular_collection.similarity_search(query=user_question, k=4)
for doc in initial_results:
    print(doc)

In [None]:
# COMMENT: the retrieval from the vector store against the original question is bad

## Question rewrite

### Setting up the query rewriter chain

In [None]:
from langchain_ollama.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Chat model used both to rewrite the question and to generate the final
# answer ("gemma3:1b" through the Ollama integration).
llm = ChatOllama(model="gemma3:1b")

In [None]:
# Prompt asking the model to turn a user question into a search query for the
# vector store. {user_question} is filled in when the chain is invoked.
# NOTE(review): the prompt requests quotes around the query, so those quote
# characters presumably end up in the text sent to the similarity search.
rewriter_prompt_template = """
Generate search query for the Chroma DB vector store from a user question, allowing for a more accurate response through semantic search.
Just return the revised Chroma DB query, with quotes around it. 

User question: {user_question}
Revised Chroma DB query:
"""

rewriter_prompt = ChatPromptTemplate.from_template(rewriter_prompt_template)

In [None]:
# Prompt -> chat model -> plain string: produces the rewritten search query.
rewriter_chain = rewriter_prompt | llm | StrOutputParser()

### Retrieving content with the rewritten query

In [None]:
user_question = "Tell me some fun things I can do in Kyoto"

# Ask the model for a vector-store query derived from the question, then show it.
search_query = rewriter_chain.invoke({"user_question": user_question})
print(search_query)

In [None]:
# Search with the rewritten query instead of the raw question.
# Note that k=3 here, whereas the original-question search used k=4.
improved_results = granular_collection.similarity_search(query=search_query, k=3)
for doc in improved_results:
    print(doc)

### Combining everything in a single RAG chain

In [None]:
from langchain_core.runnables import RunnablePassthrough

In [None]:
# Retriever interface over the granular collection, created with default settings.
retriever = granular_collection.as_retriever()

# Answering prompt: filled with the retrieved context and the user's question.
rag_prompt_template = """
Given a question and some context, answer the question.
If you do not know the answer, just say I do not know.

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(rag_prompt_template)

# The chain is invoked with the raw user question (a string), which fans out to:
#   "context":  wrapped as {"user_question": ...}, rewritten by rewriter_chain,
#               then passed to the retriever
#   "question": passed through unchanged
# NOTE(review): the retriever output is inserted into {context} as-is,
# presumably as the string form of a list of documents — confirm this is intended.
rewrite_retrieve_read_rag_chain = (
    {
        "context": {"user_question": RunnablePassthrough()}
        | rewriter_chain
        | retriever,  # A
        "question": RunnablePassthrough(),  # B
    }
    | rag_prompt
    | llm
    | StrOutputParser()
)
# A The context is returned by the retriever after feeding it the rewritten query
# B This is the original user question

In [None]:
user_question = "Tell me some fun things I can do in Kyoto"

# Run the full pipeline on the raw question: rewrite it, retrieve context,
# and generate the answer.
answer = rewrite_retrieve_read_rag_chain.invoke(user_question)
print(answer)