# Day 3: Making Answers Bright with Multi-Query RAG Magic

In [49]:
!wget https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv

--2024-12-10 20:12:46--  https://raw.githubusercontent.com/amankharwal/Website-data/master/bbc-news-data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5080260 (4.8M) [text/plain]
Saving to: ‘bbc-news-data.csv.3’

bbc-news-data.csv.3   0%[                    ]       0  --.-KB/s               

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



2024-12-10 20:12:46 (51.6 MB/s) - ‘bbc-news-data.csv.3’ saved [5080260/5080260]



In [50]:
import csv
from typing import List

from haystack import Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy
from haystack.components.writers import DocumentWriter
from haystack import Pipeline

## Parse and index the news

In [51]:
def read_documents(file: str) -> List[Document]:
    with open(file, "r") as file:
        reader = csv.reader(file, delimiter="\t")
        next(reader, None)  # skip the headers
        docs = []
        for row in reader:
            category = row[0].strip()
            title = row[2].strip()
            text = row[3].strip()
            docs.append(Document(content=text, meta={"category": category, "title": title}))

    return docs

In [52]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
doc_store = InMemoryDocumentStore()

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(model=embedding_model))
indexing_pipeline.add_component("writer", DocumentWriter(doc_store, policy=DuplicatePolicy.OVERWRITE))
indexing_pipeline.connect("embedder", "writer")

documents = read_documents("bbc-news-data.csv")
indexing_pipeline.run({"embedder":{"documents": documents}})

Batches:   0%|          | 0/70 [00:00<?, ?it/s]

{'writer': {'documents_written': 2225}}

## Define a custom component MultiQueryGenerator 

In [53]:
query_generator_template = """
You are an AI language model assistant. Your task is to generate a different version of the
given user question by expanding the meaning of it.
By generating different perspective on the user question, you will help gather diverse information that will be useful to answer the user question in more comprehensive manner.
The generated question should be concise. Do not just rephrase the question, think about the other topics that are relevent to the user question.

Provide alternative question only.
Original question: {{question}}
Alternative: question:
"""

In [54]:
from haystack import component
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder

@component
class MultiQueryGenerator:
    def __init__(self, prompt_builder: PromptBuilder = None, generator: OpenAIGenerator = None):
        # You need to define a Generator and a PromptBuilder to pass to that generator
        # The template of the PromptBuilder will have two variables:
        #    - 'query' a string,
        #    - 'n_variations' the number of variations of the query string to generate
        if prompt_builder is None:
            prompt_builder = PromptBuilder(template=query_generator_template, required_variables=["question"])
        if generator is None:
            generator = OpenAIGenerator()

        self.prompt_builder = prompt_builder
        self.generator = generator

    @component.output_types(queries=List[str])
    def run(self, query: str, n_variations: int = 3):
        # You need build a prompt filling in the variables 'query' and 'n_variations'
        prompt_dict = self.prompt_builder.run(template_variables={"question": query})
        # This prompt is then pased to a generator, and you need to collect the result
        # You should return a List[str] with the original query, plus, the 'n_variations' generated by the LLM
        print(f"{query=}")
        generated_queries = self.generator.run(**prompt_dict, generation_kwargs={"n": n_variations})
        print(f"{generated_queries["replies"]}")
        queries = [query] + generated_queries["replies"]
        return {"queries": queries}

In [55]:
# multi_query_generator = MultiQueryGenerator()
# multi_query_generator.run("What is popular in the music industry today?")

In [56]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers import InMemoryEmbeddingRetriever

@component
class MultiQueryHandler:
    def __init__(self, document_store: InMemoryDocumentStore, embedding_model: str):
        # Initialize your SentenceTransformersTextEmbedder and InMemoryEmbeddingRetriever here
        self.retriever = InMemoryEmbeddingRetriever(document_store)
        self.embedder = SentenceTransformersTextEmbedder(model=embedding_model)
        # Ensure that the embedding model used for indexing is the same one used for querying in SentenceTransformersTextEmbedder

    @component.output_types(answers=List[Document])
    def run(self, queries: List[str], top_k: int = 3):
        # You need to initialize an embedder to embed each query in `queries`
        self.embedder.warm_up()
        query_embeddings = [self.embedder.run(query)["embedding"] for query in queries]
        # Each query will be used to retrieve a List[Document] from the document_store
        documents = [document for query_embedding in query_embeddings for document in self.retriever.run(query_embedding, top_k=top_k)["documents"]]
        # You then need to pack all those into a single List[Document] and return it
        return {"answers": documents}

In [57]:
# multi_query_handler = MultiQueryHandler(document_store=doc_store, embedding_model=embedding_model)
# multi_query_handler.run(["What is popular in the music industry today?", "Which artists are trending in the music industry today?"])

## Define the RAG Pipeline with Multi-Query Retrieval

In [58]:
from haystack.components.builders import PromptBuilder, AnswerBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.components.joiners import DocumentJoiner

In [59]:
template = """
You have to answer the following question based on the given context information only.
If the context is empty or just a '\\n' answer with None, example: "None".

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

In [60]:
pipeline = Pipeline()

# add components
pipeline.add_component("multi_query_generator", MultiQueryGenerator())
pipeline.add_component("multi_query_handler", MultiQueryHandler(document_store=doc_store, embedding_model=embedding_model))
pipeline.add_component("reranker", DocumentJoiner(join_mode="reciprocal_rank_fusion"))
pipeline.add_component("prompt_builder", PromptBuilder(template=template))
pipeline.add_component("llm", OpenAIGenerator())
pipeline.add_component("answer_builder", AnswerBuilder())

# connect components
pipeline.connect("multi_query_generator.queries", "multi_query_handler.queries")
pipeline.connect("multi_query_handler.answers", "reranker.documents")
pipeline.connect("reranker", "prompt_builder.documents")
pipeline.connect("prompt_builder", "llm")
pipeline.connect("llm.replies", "answer_builder.replies")
pipeline.connect("llm.meta", "answer_builder.meta")

<haystack.core.pipeline.pipeline.Pipeline object at 0x70261613fce0>
🚅 Components
  - multi_query_generator: MultiQueryGenerator
  - multi_query_handler: MultiQueryHandler
  - reranker: DocumentJoiner
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
  - answer_builder: AnswerBuilder
🛤️ Connections
  - multi_query_generator.queries -> multi_query_handler.queries (List[str])
  - multi_query_handler.answers -> reranker.documents (List[Document])
  - reranker.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)
  - llm.replies -> answer_builder.replies (List[str])
  - llm.meta -> answer_builder.meta (List[Dict[str, Any]])

In [66]:
# question = "Can you give me some suggestions do you have for Christmas presents? Please provide a variety of options."
# question = "What is popular in the music industry today?"
question = "How are cybersecurity threats evolving with new technologies?"
# question = "What does UK do to prevent piracy in music industry?"

In [67]:
n_variations = 3
top_k = 3

result = pipeline.run({
    'multi_query_generator': {'query': question, 'n_variations': n_variations},
     'multi_query_handler': {'top_k': top_k},
     'prompt_builder': {'template_variables': {'question': question}},
     'answer_builder': {'query': question}
     }, include_outputs_from={"multi_query_generator"}
)

query='How are cybersecurity threats evolving with new technologies?'
['In what ways are emerging technologies influencing the nature and complexity of cybersecurity threats, and what measures can organizations implement to mitigate these risks?', 'What are the implications of emerging technologies, like artificial intelligence and blockchain, on the nature and tactics of cybersecurity threats?', 'In what ways are emerging technologies influencing the nature and complexity of cybersecurity threats, and how can organizations adapt their defenses accordingly?']


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [68]:
print("\n\nQuestions:\n")
for q in result['multi_query_generator']['queries']:
    print(q)
print("\n\nAnswer:\n")
print(result['answer_builder']['answers'][0].data)



Questions:

How are cybersecurity threats evolving with new technologies?
In what ways are emerging technologies influencing the nature and complexity of cybersecurity threats, and what measures can organizations implement to mitigate these risks?
What are the implications of emerging technologies, like artificial intelligence and blockchain, on the nature and tactics of cybersecurity threats?
In what ways are emerging technologies influencing the nature and complexity of cybersecurity threats, and how can organizations adapt their defenses accordingly?


Answer:

Cybersecurity threats are evolving as tech-savvy criminals increasingly exploit new technologies to perpetrate crimes. The creation of malware is shifting from random virus production aimed at causing disruption to more sophisticated and targeted attacks designed for direct financial gain. Criminals are now focusing on methods that allow them to con people, steal valuable data, or take over home PCs. 

The categorization of